AltiVec SIMD implementation of fast integer inverse DCT

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1445 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2014-12-18 10:12:29 +00:00
parent 7475e59637
commit c7dadd2d0b
3 changed files with 231 additions and 0 deletions

View File

@@ -667,6 +667,10 @@ EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
(DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col, (DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
const int * idct_coefs); const int * idct_coefs);
/* AltiVec SIMD implementation of the fast integer inverse DCT */
EXTERN(void) jsimd_idct_ifast_altivec
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col);
/* Floating Point Inverse DCT */ /* Floating Point Inverse DCT */
EXTERN(void) jsimd_idct_float_3dnow EXTERN(void) jsimd_idct_float_3dnow
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,

View File

@@ -339,6 +339,17 @@ jsimd_can_idct_islow (void)
GLOBAL(int) GLOBAL(int)
jsimd_can_idct_ifast (void) jsimd_can_idct_ifast (void)
{ {
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_ALTIVEC)
return 1;
return 0; return 0;
} }
@@ -360,6 +371,8 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf, JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col) JDIMENSION output_col)
{ {
jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
output_col);
} }
GLOBAL(void) GLOBAL(void)

View File

@@ -478,3 +478,217 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
*(__vector short *)&data[48] = out6; *(__vector short *)&data[48] = out6;
*(__vector short *)&data[56] = out7; *(__vector short *)&data[56] = out7;
} }
/* FAST INTEGER INVERSE DCT
*
* This is similar to the SSE2 implementation, except that we left-shift the
* constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.) This is because
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
* the elements in arg3 + the most significant 17 bits of
* (the elements in arg1 * the elements in arg2).
*/
/*
 * Fixed-point multipliers for the fast integer IDCT, scaled by 256 (so that
 * FIX(1) == 256, which is what the F_1_613 definition subtracts).
 */
#define F_1_082 277 /* FIX(1.082392200) */
#define F_1_414 362 /* FIX(1.414213562) */
#define F_1_847 473 /* FIX(1.847759065) */
#define F_2_613 669 /* FIX(2.613125930) */
#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
/* Added to 3 to form the right-shift count applied after the second pass */
#define IFAST_PASS1_BITS 2
/* Value added to every packed output byte before it is stored */
#define IFAST_CENTERJSAMPLE 128
/*
 * Multiplier constants for the fast integer IDCT, each left-shifted by
 * IFAST_CONST_SHIFT so they can be fed to vec_madds().  Only elements 0-3 are
 * given explicitly; the remaining elements of the vector are zero-initialized.
 * jsimd_idct_ifast_altivec() broadcasts elements 0..3 with vec_splat().
 */
static const __vector short jconst_idct_ifast __attribute__((aligned(16))) =
{
  F_1_414 << IFAST_CONST_SHIFT,
  F_1_847 << IFAST_CONST_SHIFT,
  /* Shift the positive value, then negate: left-shifting a negative value is
     undefined behaviour in C. */
  -(F_1_613 << IFAST_CONST_SHIFT),
  F_1_082 << IFAST_CONST_SHIFT
};
/*
 * Byte constant for the fast integer IDCT output stage.  Only element 0 is
 * given explicitly (IFAST_CENTERJSAMPLE); jsimd_idct_ifast_altivec() splats
 * that element across a vector and adds it to the packed output bytes.
 * NOTE(review): 128 does not fit in a signed char, so the stored value
 * depends on the implementation's int -> signed char conversion (presumably
 * -128, which is the same as +128 under modular byte addition) -- confirm.
 */
static const __vector signed char jconst_idct_ifast2
__attribute__((aligned(16))) =
{
IFAST_CENTERJSAMPLE
};
/*
 * One pass of the fast integer inverse DCT over eight vectors.
 *
 * `in` is a name prefix: the macro reads the eight vectors in##0 .. in##7
 * and writes the results to out0 .. out7.  It expands to a brace-enclosed
 * block and relies on the following names being declared in the enclosing
 * scope:
 *   - temporaries: tmp0..tmp7, tmp10..tmp13, z5, z10, z10s, z11, z12s, z13
 *   - constants:   zero, PW_F1414, PW_F1847, PW_MF1613, PW_F1082,
 *                  PRE_MULTIPLY_SCALE_BITS
 *
 * Every value that is about to be multiplied with vec_madds() is first
 * left-shifted by PRE_MULTIPLY_SCALE_BITS.  z10 is kept in both unshifted
 * (z10) and shifted (z10s) form because the unshifted value is subtracted
 * again after the PW_MF1613 multiplication.
 */
#define DO_IDCT_IFAST(in) \
{ \
/* Even part */ \
\
tmp10 = vec_add(in##0, in##4); \
tmp11 = vec_sub(in##0, in##4); \
tmp13 = vec_add(in##2, in##6); \
\
tmp12 = vec_sub(in##2, in##6); \
tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS); \
tmp12 = vec_madds(tmp12, PW_F1414, zero); \
tmp12 = vec_sub(tmp12, tmp13); \
\
tmp0 = vec_add(tmp10, tmp13); \
tmp3 = vec_sub(tmp10, tmp13); \
tmp1 = vec_add(tmp11, tmp12); \
tmp2 = vec_sub(tmp11, tmp12); \
\
/* Odd part */ \
\
z13 = vec_add(in##5, in##3); \
z10 = vec_sub(in##5, in##3); \
z10s = vec_sl(z10, PRE_MULTIPLY_SCALE_BITS); \
z11 = vec_add(in##1, in##7); \
z12s = vec_sub(in##1, in##7); \
z12s = vec_sl(z12s, PRE_MULTIPLY_SCALE_BITS); \
\
tmp11 = vec_sub(z11, z13); \
tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS); \
tmp11 = vec_madds(tmp11, PW_F1414, zero); \
\
tmp7 = vec_add(z11, z13); \
\
z5 = vec_add(z10s, z12s); \
z5 = vec_madds(z5, PW_F1847, zero); \
\
tmp10 = vec_madds(z12s, PW_F1082, zero); \
tmp10 = vec_sub(tmp10, z5); \
tmp12 = vec_madds(z10s, PW_MF1613, zero); \
tmp12 = vec_sub(tmp12, z10); \
tmp12 = vec_add(tmp12, z5); \
\
tmp6 = vec_sub(tmp12, tmp7); \
tmp5 = vec_sub(tmp11, tmp6); \
tmp4 = vec_add(tmp10, tmp5); \
\
out0 = vec_add(tmp0, tmp7); \
out1 = vec_add(tmp1, tmp6); \
out2 = vec_add(tmp2, tmp5); \
out3 = vec_sub(tmp3, tmp4); \
out4 = vec_add(tmp3, tmp4); \
out5 = vec_sub(tmp2, tmp5); \
out6 = vec_sub(tmp1, tmp6); \
out7 = vec_sub(tmp0, tmp7); \
}
/*
 * AltiVec fast integer inverse DCT of a single block.
 *
 * dct_table_  multiplier table; read as eight vectors of 8 shorts (offsets
 *             0, 8, ..., 56) and multiplied element-wise into the
 *             coefficients
 * coef_block  input coefficients; read as eight vectors of 8 shorts
 *             (offsets 0, 8, ..., 56)
 * output_buf  array of row pointers; entries 0..7 are written
 * output_col  offset added to each row pointer before storing
 *
 * One long long (half of a 16-byte vector) is stored to each of the eight
 * output rows.
 *
 * NOTE(review): the vector loads dereference casted pointers, so coef_block
 * and dct_table_ are presumably required to be 16-byte aligned, and the
 * long long stores presumably require output_buf[i] + output_col to be
 * suitably aligned -- confirm against the callers.
 */
void
jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
short *dct_table = (short *)dct_table_;
/* The names rowN/colN/outN and the tmp and z temporaries are fixed: the
   DO_IDCT_IFAST and TRANSPOSE macros build them by token pasting. */
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
col0, col1, col2, col3, col4, col5, col6, col7,
quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
z5, z10, z10s, z11, z12s, z13,
out0, out1, out2, out3, out4, out5, out6, out7;
__vector signed char outb;
/* outbptr views the 16-byte vector outb as two long long halves.
   NOTE(review): this is type punning through a pointer cast; confirm it is
   safe under the compiler's aliasing rules. */
long long *outptr, *outbptr = (long long *)(&outb);
/* Constants */
__vector short zero = vec_splat_s16(0),
PW_F1414 = vec_splat(jconst_idct_ifast, 0),
PW_F1847 = vec_splat(jconst_idct_ifast, 1),
PW_MF1613 = vec_splat(jconst_idct_ifast, 2),
PW_F1082 = vec_splat(jconst_idct_ifast, 3);
__vector unsigned short
PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS),
PASS1_BITS3 = vec_splat_u16(IFAST_PASS1_BITS + 3);
__vector signed char PB_CENTERJSAMP = vec_splat(jconst_idct_ifast2, 0);
/* Pass 1: process columns. */
/* Load the block as eight vectors of eight coefficients each. */
col0 = *(__vector short *)&coef_block[0];
col1 = *(__vector short *)&coef_block[8];
col2 = *(__vector short *)&coef_block[16];
col3 = *(__vector short *)&coef_block[24];
col4 = *(__vector short *)&coef_block[32];
col5 = *(__vector short *)&coef_block[40];
col6 = *(__vector short *)&coef_block[48];
col7 = *(__vector short *)&coef_block[56];
/* OR vectors 1..7 together; the result is all zero only if every
   coefficient outside the first vector is zero. */
tmp1 = vec_or(col1, col2);
tmp2 = vec_or(col3, col4);
tmp1 = vec_or(tmp1, tmp2);
tmp3 = vec_or(col5, col6);
tmp3 = vec_or(tmp3, col7);
tmp1 = vec_or(tmp1, tmp3);
/* The first vector is scaled by its multipliers on both paths below. */
quant0 = *(__vector short *)&dct_table[0];
col0 = vec_mladd(col0, quant0, zero);
if (vec_all_eq(tmp1, zero)) {
/* AC terms all zero */
/* Shortcut: skip the first pass and the transpose; each rowN is simply
   element N of the scaled first vector broadcast to all lanes. */
row0 = vec_splat(col0, 0);
row1 = vec_splat(col0, 1);
row2 = vec_splat(col0, 2);
row3 = vec_splat(col0, 3);
row4 = vec_splat(col0, 4);
row5 = vec_splat(col0, 5);
row6 = vec_splat(col0, 6);
row7 = vec_splat(col0, 7);
} else {
/* General case: scale the remaining seven vectors by their multipliers. */
quant1 = *(__vector short *)&dct_table[8];
quant2 = *(__vector short *)&dct_table[16];
quant3 = *(__vector short *)&dct_table[24];
quant4 = *(__vector short *)&dct_table[32];
quant5 = *(__vector short *)&dct_table[40];
quant6 = *(__vector short *)&dct_table[48];
quant7 = *(__vector short *)&dct_table[56];
col1 = vec_mladd(col1, quant1, zero);
col2 = vec_mladd(col2, quant2, zero);
col3 = vec_mladd(col3, quant3, zero);
col4 = vec_mladd(col4, quant4, zero);
col5 = vec_mladd(col5, quant5, zero);
col6 = vec_mladd(col6, quant6, zero);
col7 = vec_mladd(col7, quant7, zero);
/* col0..col7 -> out0..out7, then out0..out7 -> row0..row7.  TRANSPOSE is
   defined outside this view; presumably an 8x8 transpose of shorts. */
DO_IDCT_IFAST(col);
TRANSPOSE(out, row);
}
/* Pass 2: process rows. */
DO_IDCT_IFAST(row);
/* Descale: arithmetic right shift by IFAST_PASS1_BITS + 3. */
out0 = vec_sra(out0, PASS1_BITS3);
out1 = vec_sra(out1, PASS1_BITS3);
out2 = vec_sra(out2, PASS1_BITS3);
out3 = vec_sra(out3, PASS1_BITS3);
out4 = vec_sra(out4, PASS1_BITS3);
out5 = vec_sra(out5, PASS1_BITS3);
out6 = vec_sra(out6, PASS1_BITS3);
out7 = vec_sra(out7, PASS1_BITS3);
/* out0..out7 -> col0..col7 (col* is reused here as the output vectors). */
TRANSPOSE(out, col);
/* Output stage, repeated for each pair of vectors: pack two vectors of
   shorts into one vector of 16 signed bytes, add PB_CENTERJSAMP to every
   byte, then store the first 8 bytes of outb to one output row and the
   last 8 bytes to the next. */
outb = vec_packs(col0, col1);
outb = vec_add(outb, PB_CENTERJSAMP);
outptr = (long long *)(output_buf[0] + output_col);
*outptr = outbptr[0];
outptr = (long long *)(output_buf[1] + output_col);
*outptr = outbptr[1];
outb = vec_packs(col2, col3);
outb = vec_add(outb, PB_CENTERJSAMP);
outptr = (long long *)(output_buf[2] + output_col);
*outptr = outbptr[0];
outptr = (long long *)(output_buf[3] + output_col);
*outptr = outbptr[1];
outb = vec_packs(col4, col5);
outb = vec_add(outb, PB_CENTERJSAMP);
outptr = (long long *)(output_buf[4] + output_col);
*outptr = outbptr[0];
outptr = (long long *)(output_buf[5] + output_col);
*outptr = outbptr[1];
outb = vec_packs(col6, col7);
outb = vec_add(outb, PB_CENTERJSAMP);
outptr = (long long *)(output_buf[6] + output_col);
*outptr = outbptr[0];
outptr = (long long *)(output_buf[7] + output_col);
*outptr = outbptr[1];
}