AltiVec SIMD implementation of sample conversion and integer quantization

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1474 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2014-12-23 04:14:54 +00:00
parent 70d831dc0d
commit f654cf0e2c
4 changed files with 274 additions and 1 deletions

View File

@@ -75,7 +75,8 @@ if SIMD_POWERPC
libsimd_la_SOURCES = jsimd_powerpc.c \
jccolor-altivec.c jcgray-altivec.c \
jfdctfst-altivec.c jfdctint-altivec.c \
jidctfst-altivec.c jidctint-altivec.c
jidctfst-altivec.c jidctint-altivec.c \
jquanti-altivec.c
libsimd_la_CFLAGS = -maltivec
jccolor-altivec.lo: jccolext-altivec.c

236
simd/jquanti-altivec.c Normal file
View File

@@ -0,0 +1,236 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014, D. R. Commander.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
#include "jsimd_altivec.h"
/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result.)
 *
 * LOAD_ROW(row) loads one row of samples into the vector in<row>.  It is not
 * self-contained: sample_data, start_col, elemptr, and in<row> must all be in
 * scope where the macro is used, and elemptr is overwritten.
 *
 * When the element address is not a multiple of 16, the loaded vector is
 * rotated with the permute mask from vec_lvsl() so that the wanted samples
 * come first.
 *
 * The body is wrapped in do { } while (0) so that "LOAD_ROW(n);" behaves as a
 * single statement, including inside an unbraced if/else.
 */
#define LOAD_ROW(row) do { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
} while (0)
/*
 * Convert eight rows of eight samples into a block of 64 centered 16-bit
 * values.
 *
 * sample_data  array of row pointers; rows 0 through 7 are read
 * start_col    column offset added to each row pointer
 * workspace    receives eight vectors at byte offsets 0, 16, ..., 112
 *              (vec_st stores to 16-byte-aligned addresses, so this buffer is
 *              presumably 16-byte aligned -- confirm against the caller)
 */
void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                        DCTELEM * workspace)
{
  JSAMPROW elemptr;             /* scratch pointer assigned by LOAD_ROW() */
  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) };
  __vector short pw_center = { __8X(CENTERJSAMPLE) };

  /* Fetch the eight sample rows. */
  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

  /* Interleave a zero byte in front of each of the first eight sample bytes,
   * which widens them to 16-bit elements, then subtract CENTERJSAMPLE from
   * every element.
   */
  coef0 = vec_sub((__vector short)vec_mergeh(pb_zero, in0), pw_center);
  coef1 = vec_sub((__vector short)vec_mergeh(pb_zero, in1), pw_center);
  coef2 = vec_sub((__vector short)vec_mergeh(pb_zero, in2), pw_center);
  coef3 = vec_sub((__vector short)vec_mergeh(pb_zero, in3), pw_center);
  coef4 = vec_sub((__vector short)vec_mergeh(pb_zero, in4), pw_center);
  coef5 = vec_sub((__vector short)vec_mergeh(pb_zero, in5), pw_center);
  coef6 = vec_sub((__vector short)vec_mergeh(pb_zero, in6), pw_center);
  coef7 = vec_sub((__vector short)vec_mergeh(pb_zero, in7), pw_center);

  /* Write the rows out back to back, one 16-byte vector per row. */
  vec_st(coef0, 0, workspace);
  vec_st(coef1, 16, workspace);
  vec_st(coef2, 32, workspace);
  vec_st(coef3, 48, workspace);
  vec_st(coef4, 64, workspace);
  vec_st(coef5, 80, workspace);
  vec_st(coef6, 96, workspace);
  vec_st(coef7, 112, workspace);
}
#define WORD_BIT 16
/* There is no single AltiVec instruction that multiplies unsigned 16-bit
 * elements and keeps a 16-bit result per element, hence this.
 *
 * MULTIPLY(vs0, vs1, out) multiplies the even and odd 16-bit elements of vs0
 * and vs1 as unsigned values (vec_mule / vec_mulo, each yielding 32-bit
 * products) and then uses vec_perm with shift_pack_index to pick two bytes of
 * every product and interleave them back into a single vector, which is
 * stored in out.
 *
 * The macro is not self-contained: tmpe, tmpo (overwritten), and
 * shift_pack_index must be in scope where it is used.  vs0 and vs1 are each
 * expanded twice, so do not pass expressions with side effects.  out may name
 * the same variable as vs0 or vs1, because both products are computed before
 * out is written.
 *
 * Arguments are parenthesized so that the casts apply to the whole argument
 * expression, and the body is wrapped in do { } while (0) so that
 * "MULTIPLY(a, b, c);" behaves as a single statement.
 */
#define MULTIPLY(vs0, vs1, out) do { \
  tmpe = vec_mule((__vector unsigned short)(vs0), \
                  (__vector unsigned short)(vs1)); \
  tmpo = vec_mulo((__vector unsigned short)(vs0), \
                  (__vector unsigned short)(vs1)); \
  (out) = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                   (__vector unsigned short)tmpo, \
                                   shift_pack_index); \
} while (0)
/*
 * Quantize one block of 64 16-bit coefficients.
 *
 * coef_block  receives the eight result vectors (byte offsets 0 ... 112)
 * divisors    table read at three byte offsets: 0 (first multiplier),
 *             DCTSIZE2 * 2 (additive correction), and DCTSIZE2 * 4 (second
 *             multiplier), eight vectors each
 * workspace   the eight input vectors (byte offsets 0 ... 112)
 *
 * Each element is replaced by its absolute value, has the correction added,
 * is passed through MULTIPLY() with each of the two multipliers in turn, and
 * finally gets its original sign back.
 */
void
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
                        DCTELEM * workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7;
  /* Per-element sign masks: all ones where the input was negative, else 0 */
  __vector short sign0, sign1, sign2, sign3, sign4, sign5, sign6, sign7;
  __vector short recip0, recip1, recip2, recip3, recip4, recip5, recip6,
    recip7;
  __vector short scale0, scale1, scale2, scale3, scale4, scale5, scale6,
    scale7;
  /* Scratch products written by MULTIPLY() */
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
  /* Byte selector consumed by MULTIPLY(): the first two bytes of each 32-bit
   * product, alternating between the even (tmpe) and odd (tmpo) products.
   */
  __vector unsigned char shift_pack_index =
    { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };

  /* Load the block. */
  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Arithmetic shift by (WORD_BIT - 1) smears the sign bit across each
   * element, producing the sign masks.
   */
  sign0 = vec_sra(row0, pw_word_bit_m1);
  sign1 = vec_sra(row1, pw_word_bit_m1);
  sign2 = vec_sra(row2, pw_word_bit_m1);
  sign3 = vec_sra(row3, pw_word_bit_m1);
  sign4 = vec_sra(row4, pw_word_bit_m1);
  sign5 = vec_sra(row5, pw_word_bit_m1);
  sign6 = vec_sra(row6, pw_word_bit_m1);
  sign7 = vec_sra(row7, pw_word_bit_m1);

  /* Branch-less absolute value: (x ^ mask) - mask */
  row0 = vec_sub(vec_xor(row0, sign0), sign0);
  row1 = vec_sub(vec_xor(row1, sign1), sign1);
  row2 = vec_sub(vec_xor(row2, sign2), sign2);
  row3 = vec_sub(vec_xor(row3, sign3), sign3);
  row4 = vec_sub(vec_xor(row4, sign4), sign4);
  row5 = vec_sub(vec_xor(row5, sign5), sign5);
  row6 = vec_sub(vec_xor(row6, sign6), sign6);
  row7 = vec_sub(vec_xor(row7, sign7), sign7);

  /* Add the correction terms stored DCTSIZE2 * 2 bytes into the table. */
  row0 = vec_add(row0, vec_ld(DCTSIZE2 * 2, divisors));
  row1 = vec_add(row1, vec_ld(DCTSIZE2 * 2 + 16, divisors));
  row2 = vec_add(row2, vec_ld(DCTSIZE2 * 2 + 32, divisors));
  row3 = vec_add(row3, vec_ld(DCTSIZE2 * 2 + 48, divisors));
  row4 = vec_add(row4, vec_ld(DCTSIZE2 * 2 + 64, divisors));
  row5 = vec_add(row5, vec_ld(DCTSIZE2 * 2 + 80, divisors));
  row6 = vec_add(row6, vec_ld(DCTSIZE2 * 2 + 96, divisors));
  row7 = vec_add(row7, vec_ld(DCTSIZE2 * 2 + 112, divisors));

  /* First multiplier set, from the start of the table. */
  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  /* Second multiplier set, DCTSIZE2 * 4 bytes into the table. */
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  /* Re-apply the original signs with the same (x ^ mask) - mask identity. */
  row0 = vec_sub(vec_xor(row0, sign0), sign0);
  row1 = vec_sub(vec_xor(row1, sign1), sign1);
  row2 = vec_sub(vec_xor(row2, sign2), sign2);
  row3 = vec_sub(vec_xor(row3, sign3), sign3);
  row4 = vec_sub(vec_xor(row4, sign4), sign4);
  row5 = vec_sub(vec_xor(row5, sign5), sign5);
  row6 = vec_sub(vec_xor(row6, sign6), sign6);
  row7 = vec_sub(vec_xor(row7, sign7), sign7);

  /* Store the quantized block. */
  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}

View File

@@ -568,6 +568,9 @@ EXTERN(void) jsimd_convsamp_neon
EXTERN(void) jsimd_convsamp_mips_dspr2
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
EXTERN(void) jsimd_convsamp_altivec
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
/* Floating Point Sample Conversion */
EXTERN(void) jsimd_convsamp_float_3dnow
(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
@@ -622,6 +625,9 @@ EXTERN(void) jsimd_quantize_neon
EXTERN(void) jsimd_quantize_mips_dspr2
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
EXTERN(void) jsimd_quantize_altivec
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
/* Floating Point Quantization */
EXTERN(void) jsimd_quantize_float_3dnow
(JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);

View File

@@ -290,6 +290,21 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
GLOBAL(int)
jsimd_can_convsamp (void)
{
  init_simd();

  /* The SIMD routine is only valid for a DCTSIZE of 8, 8-bit samples,
   * 4-byte JDIMENSIONs, and 2-byte DCTELEMs; decline for anything else.
   */
  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 ||
      sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
    return 0;

  /* Report 1 only when the AltiVec bit is set in simd_support. */
  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}
@@ -303,6 +318,7 @@ GLOBAL(void)
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM * workspace)
{
/* Forwards all three arguments unchanged to the AltiVec routine.  No
 * simd_support check is made here.
 * NOTE(review): presumably callers consult jsimd_can_convsamp() first --
 * confirm.
 */
jsimd_convsamp_altivec(sample_data, start_col, workspace);
}
GLOBAL(void)
@@ -371,6 +387,19 @@ jsimd_fdct_float (FAST_FLOAT * data)
GLOBAL(int)
jsimd_can_quantize (void)
{
  init_simd();

  /* The SIMD routine is only valid for a DCTSIZE of 8 with 2-byte JCOEFs
   * and 2-byte DCTELEMs; decline for anything else.
   */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
    return 0;

  /* Report 1 only when the AltiVec bit is set in simd_support. */
  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}
@@ -384,6 +413,7 @@ GLOBAL(void)
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
DCTELEM * workspace)
{
/* Forwards all three arguments unchanged to the AltiVec routine.  No
 * simd_support check is made here.
 * NOTE(review): presumably callers consult jsimd_can_quantize() first --
 * confirm.
 */
jsimd_quantize_altivec(coef_block, divisors, workspace);
}
GLOBAL(void)