diff --git a/simd/jsimd.h b/simd/jsimd.h
index b0329728..4dcdfc1a 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -545,6 +545,8 @@ EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
 
 EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
 
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
+
 /* Fast Integer Forward DCT */
 EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
 
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index a9a5965d..ff37c5f0 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -226,6 +226,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
 GLOBAL(int)
 jsimd_can_fdct_islow (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -255,6 +266,7 @@ jsimd_can_fdct_float (void)
 GLOBAL(void)
 jsimd_fdct_islow (DCTELEM * data)
 {
+  jsimd_fdct_islow_altivec(data);
 }
 
 GLOBAL(void)
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index e18eaa8e..ef32545d 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -29,6 +29,9 @@
 #include "jsimd.h"
 #include <altivec.h>
 
+
+/* Common code */
+
 #define TRANSPOSE(row, col) \
 { \
   __vector short row04l, row04h, row15l, row15h, \
@@ -67,15 +70,30 @@
   col##7 = vec_mergel(col67e, col67o);  /* col7=(07 17 27 37 47 57 67 77) */ \
 }
 
-static const __vector short constants __attribute__((aligned(16))) =
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#define IFAST_CONST_BITS 8
+#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
+#define IFAST_CONST_SHIFT \
+  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
+
+static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
 {
-  98 << 5,   /* FIX(0.382683433) */
-  139 << 5,  /* FIX(0.541196100) */
-  181 << 5,  /* FIX(0.707106781) */
-  334 << 5   /* FIX(1.306562965) */
+  98 << IFAST_CONST_SHIFT,   /* FIX(0.382683433) */
+  139 << IFAST_CONST_SHIFT,  /* FIX(0.541196100) */
+  181 << IFAST_CONST_SHIFT,  /* FIX(0.707106781) */
+  334 << IFAST_CONST_SHIFT   /* FIX(1.306562965) */
 };
 
-#define DO_DCT() \
+#define DO_FDCT_IFAST() \
 { \
   /* Even part */ \
 \
@@ -134,11 +152,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
   /* Constants */
   __vector short zero = vec_splat_s16(0),
-    PW_0382 = vec_splat(constants, 0),
-    PW_0541 = vec_splat(constants, 1),
-    PW_0707 = vec_splat(constants, 2),
-    PW_1306 = vec_splat(constants, 3);
-  __vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
+    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
+    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
+    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
+    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
+  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
+    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
 
   /* Pass 1: process rows. */
 
@@ -162,7 +181,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp3 = vec_add(col3, col4);
   tmp4 = vec_sub(col3, col4);
 
-  DO_DCT();
+  DO_FDCT_IFAST();
 
   /* Pass 2: process columns. */
 
@@ -177,7 +196,261 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp3 = vec_add(row3, row4);
   tmp4 = vec_sub(row3, row4);
 
-  DO_DCT();
+  DO_FDCT_IFAST();
+
+  *(__vector short *)&data[0] = out0;
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
+
+
+/* SLOW INTEGER FORWARD DCT */
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define ISLOW_CONST_BITS 13
+#define ISLOW_PASS1_BITS 2
+#define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
+#define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
+
+static const __vector int jconst_fdct_islow __attribute__((aligned(16))) =
+{
+  1 << (ISLOW_DESCALE_P1 - 1),
+  1 << (ISLOW_DESCALE_P2 - 1)
+};
+
+static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
+{
+  1 << (ISLOW_PASS1_BITS - 1)
+};
+
+#define DO_FDCT_ISLOW_COMMON(PASS) \
+{ \
+  tmp1312l = vec_mergeh(tmp13, tmp12); \
+  tmp1312h = vec_mergel(tmp13, tmp12); \
+\
+  out2l = vec_msums(tmp1312l, PW_F130_F054, zero); \
+  out2h = vec_msums(tmp1312h, PW_F130_F054, zero); \
+  out6l = vec_msums(tmp1312l, PW_F054_MF130, zero); \
+  out6h = vec_msums(tmp1312h, PW_F054_MF130, zero); \
+\
+  out2l = vec_add(out2l, PD_DESCALE_P##PASS); \
+  out2h = vec_add(out2h, PD_DESCALE_P##PASS); \
+  out2l = vec_sra(out2l, DESCALE_P##PASS); \
+  out2h = vec_sra(out2h, DESCALE_P##PASS); \
+\
+  out6l = vec_add(out6l, PD_DESCALE_P##PASS); \
+  out6h = vec_add(out6h, PD_DESCALE_P##PASS); \
+  out6l = vec_sra(out6l, DESCALE_P##PASS); \
+  out6h = vec_sra(out6h, DESCALE_P##PASS); \
+\
+  out2 = vec_pack(out2l, out2h); \
+  out6 = vec_pack(out6l, out6h); \
+\
+  /* Odd part */ \
+\
+  z3 = vec_add(tmp4, tmp6); \
+  z4 = vec_add(tmp5, tmp7); \
+\
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+\
+  z3l = vec_msums(z34l, PW_MF078_F117, zero); \
+  z3h = vec_msums(z34h, PW_MF078_F117, zero); \
+  z4l = vec_msums(z34l, PW_F117_F078, zero); \
+  z4h = vec_msums(z34h, PW_F117_F078, zero); \
+\
+  tmp47l = vec_mergeh(tmp4, tmp7); \
+  tmp47h = vec_mergel(tmp4, tmp7); \
+\
+  tmp4l = vec_msums(tmp47l, PW_MF060_MF089, zero); \
+  tmp4h = vec_msums(tmp47h, PW_MF060_MF089, zero); \
+  tmp7l = vec_msums(tmp47l, PW_MF089_F060, zero); \
+  tmp7h = vec_msums(tmp47h, PW_MF089_F060, zero); \
+\
+  out7l = vec_add(z3l, tmp4l); \
+  out7h = vec_add(z3h, tmp4h); \
+  out1l = vec_add(z4l, tmp7l); \
+  out1h = vec_add(z4h, tmp7h); \
+\
+  out7l = vec_add(out7l, PD_DESCALE_P##PASS); \
+  out7h = vec_add(out7h, PD_DESCALE_P##PASS); \
+  out7l = vec_sra(out7l, DESCALE_P##PASS); \
+  out7h = vec_sra(out7h, DESCALE_P##PASS); \
+\
+  out1l = vec_add(out1l, PD_DESCALE_P##PASS); \
+  out1h = vec_add(out1h, PD_DESCALE_P##PASS); \
+  out1l = vec_sra(out1l, DESCALE_P##PASS); \
+  out1h = vec_sra(out1h, DESCALE_P##PASS); \
+\
+  out7 = vec_pack(out7l, out7h); \
+  out1 = vec_pack(out1l, out1h); \
+\
+  tmp56l = vec_mergeh(tmp5, tmp6); \
+  tmp56h = vec_mergel(tmp5, tmp6); \
+\
+  tmp5l = vec_msums(tmp56l, PW_MF050_MF256, zero); \
+  tmp5h = vec_msums(tmp56h, PW_MF050_MF256, zero); \
+  tmp6l = vec_msums(tmp56l, PW_MF256_F050, zero); \
+  tmp6h = vec_msums(tmp56h, PW_MF256_F050, zero); \
+\
+  out5l = vec_add(tmp5l, z4l); \
+  out5h = vec_add(tmp5h, z4h); \
+  out3l = vec_add(tmp6l, z3l); \
+  out3h = vec_add(tmp6h, z3h); \
+\
+  out5l = vec_add(out5l, PD_DESCALE_P##PASS); \
+  out5h = vec_add(out5h, PD_DESCALE_P##PASS); \
+  out5l = vec_sra(out5l, DESCALE_P##PASS); \
+  out5h = vec_sra(out5h, DESCALE_P##PASS); \
+\
+  out3l = vec_add(out3l, PD_DESCALE_P##PASS); \
+  out3h = vec_add(out3h, PD_DESCALE_P##PASS); \
+  out3l = vec_sra(out3l, DESCALE_P##PASS); \
+  out3h = vec_sra(out3h, DESCALE_P##PASS); \
+\
+  out5 = vec_pack(out5l, out5h); \
+  out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_ISLOW_ROWS() \
+{ \
+  /* Even part */ \
+\
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+\
+  out0 = vec_add(tmp10, tmp11); \
+  out0 = vec_sl(out0, PASS1_BITS); \
+  out4 = vec_sub(tmp10, tmp11); \
+  out4 = vec_sl(out4, PASS1_BITS); \
+\
+  DO_FDCT_ISLOW_COMMON(1); \
+}
+
+#define DO_FDCT_ISLOW_COLS() \
+{ \
+  /* Even part */ \
+\
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+\
+  out0 = vec_add(tmp10, tmp11); \
+  out0 = vec_add(out0, PW_DESCALE_P2X); \
+  out0 = vec_sra(out0, PASS1_BITS); \
+  out4 = vec_sub(tmp10, tmp11); \
+  out4 = vec_add(out4, PW_DESCALE_P2X); \
+  out4 = vec_sra(out4, PASS1_BITS); \
+\
+  DO_FDCT_ISLOW_COMMON(2); \
+}
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp4l, tmp4h, tmp5l, tmp5h, tmp6l, tmp6h, tmp7l, tmp7h,
+    z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  __vector short PW_F130_F054 = {F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541};
+  __vector short PW_F054_MF130 = {F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847};
+  __vector short PW_MF078_F117 = {F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175};
+  __vector short PW_F117_F078 = {F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390};
+  __vector short PW_MF060_MF089 = {F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899};
+  __vector short PW_MF089_F060 = {-F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899};
+  __vector short PW_MF050_MF256 = {F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562};
+  __vector short PW_MF256_F050 = {-F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562};
+  __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
+
+  /* Constants */
+  __vector unsigned short PASS1_BITS = vec_splat_u16(ISLOW_PASS1_BITS);
+  __vector int zero = vec_splat_s32(0),
+    PD_DESCALE_P1 = vec_splat(jconst_fdct_islow, 0),
+    PD_DESCALE_P2 = vec_splat(jconst_fdct_islow, 1);
+  __vector unsigned int DESCALE_P1 = vec_splat_u32(ISLOW_DESCALE_P1),
+    DESCALE_P2 = vec_splat_u32(ISLOW_DESCALE_P2);
+
+  /* Pass 1: process rows. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_ISLOW_ROWS();
+
+  /* Pass 2: process columns. */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_ISLOW_COLS();
 
   *(__vector short *)&data[0] = out0;
   *(__vector short *)&data[8] = out1;
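---

A note on the IFAST_CONST_SHIFT comment in the patch: vec_madds discards only
15 bits of the 32-bit product, where SSE2's pmulhw discards 16, so the AltiVec
constants carry one bit less of pre-shift (2 + 8 + 5 = 15).  Below is a
minimal, compile-anywhere sketch of that arithmetic; vec_madds_model() is a
hypothetical scalar stand-in for one lane of vec_madds, not an AltiVec API:

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical scalar model of one vec_madds lane:
       saturate(((a * b) >> 15) + c). */
    static int16_t vec_madds_model(int16_t a, int16_t b, int16_t c)
    {
      int32_t sum = (((int32_t)a * (int32_t)b) >> 15) + c;
      if (sum > 32767) sum = 32767;
      if (sum < -32768) sum = -32768;
      return (int16_t)sum;
    }

    int main(void)
    {
      int16_t x = 1000 << 2;       /* term pre-multiplied by 2 bits */
      int16_t pw_0707 = 181 << 5;  /* FIX(0.707106781) << IFAST_CONST_SHIFT */
      /* 2 (pre-multiply) + 8 (IFAST_CONST_BITS) + 5 (IFAST_CONST_SHIFT)
         = 15 bits, exactly what the >> 15 removes, so the result lands
         back at the original scale: prints 707 (~= 1000 * 0.707106781). */
      printf("%d\n", vec_madds_model(x, pw_0707, 0));
      return 0;
    }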
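Likewise, the PD_DESCALE_P1/PD_DESCALE_P2 rounding vectors plus the vec_sra
shifts implement libjpeg's DESCALE(x, n) = (x + (1 << (n - 1))) >> n on four
32-bit lanes at once.  A scalar sketch under the same ISLOW_* definitions;
the descale() helper is illustrative, not taken from the patch:

    #include <stdio.h>
    #include <stdint.h>

    #define ISLOW_CONST_BITS 13
    #define ISLOW_PASS1_BITS 2

    /* Round-to-nearest right shift: what vec_add(x, PD_DESCALE_Pn)
       followed by vec_sra(x, DESCALE_Pn) computes per lane. */
    static int32_t descale(int32_t x, int n)
    {
      return (x + (1 << (n - 1))) >> n;
    }

    int main(void)
    {
      /* Pass 1: sample sum 100 times F_0_541 (13 fraction bits); keep
         PASS1_BITS of extra precision.  Prints 216 (~= 100 * 0.5412 * 4). */
      printf("%d\n", descale(4433 * 100,
                             ISLOW_CONST_BITS - ISLOW_PASS1_BITS));
      /* Pass 2: a pass-1 value (100 << PASS1_BITS) times F_0_541; drop all
         fraction bits.  Prints 54 (~= 100 * 0.5412). */
      printf("%d\n", descale(4433 * (100 << ISLOW_PASS1_BITS),
                             ISLOW_CONST_BITS + ISLOW_PASS1_BITS));
      return 0;
    }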