NEON-optimized 2x2 and 4x4 scaled iDCTs
This commit is contained in:
@@ -569,6 +569,15 @@ EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
/* SIMD Inverse DCT */
|
||||
EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
|
||||
@@ -440,6 +440,21 @@ jsimd_can_idct_2x2 (void)
|
||||
{
  init_simd();

  /* The NEON routine is written for exactly one configuration: 8x8 DCT
   * blocks, 2-byte coefficients, 8-bit samples, 4-byte JDIMENSION and
   * 2-byte ISLOW multipliers.  Anything else is reported as unsupported.
   */
  if (DCTSIZE != 8 ||
      sizeof(JCOEF) != 2 ||
      BITS_IN_JSAMPLE != 8 ||
      sizeof(JDIMENSION) != 4 ||
      sizeof(ISLOW_MULT_TYPE) != 2)
    return 0;

  /* Usable only when NEON support was detected */
  return (simd_support & JSIMD_ARM_NEON) ? 1 : 0;
}
|
||||
|
||||
@@ -448,6 +463,21 @@ jsimd_can_idct_4x4 (void)
|
||||
{
  init_simd();

  /* The NEON routine is written for exactly one configuration: 8x8 DCT
   * blocks, 2-byte coefficients, 8-bit samples, 4-byte JDIMENSION and
   * 2-byte ISLOW multipliers.  Anything else is reported as unsupported.
   */
  if (DCTSIZE != 8 ||
      sizeof(JCOEF) != 2 ||
      BITS_IN_JSAMPLE != 8 ||
      sizeof(JDIMENSION) != 4 ||
      sizeof(ISLOW_MULT_TYPE) != 2)
    return 0;

  /* Usable only when NEON support was detected */
  return (simd_support & JSIMD_ARM_NEON) ? 1 : 0;
}
|
||||
|
||||
@@ -456,6 +486,8 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
  /* Dispatch to the NEON implementation; nothing is done without NEON */
  if (simd_support & JSIMD_ARM_NEON) {
    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
                        output_col);
  }
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -463,6 +495,8 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
  /* Dispatch to the NEON implementation; nothing is done without NEON */
  if (simd_support & JSIMD_ARM_NEON) {
    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
                        output_col);
  }
}
|
||||
|
||||
GLOBAL(int)
|
||||
|
||||
@@ -32,6 +32,9 @@
|
||||
.object_arch armv4
|
||||
.arm
|
||||
|
||||
|
||||
#define RESPECT_STRICT_ALIGNMENT 1
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/* Supplementary macro for setting function attributes */
|
||||
@@ -246,6 +249,374 @@ asm_function jsimd_idct_ifast_neon
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_idct_4x4_neon
|
||||
*
|
||||
* This function contains inverse-DCT code for getting reduced-size
|
||||
* 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
|
||||
* and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
|
||||
* function from jpeg-6b (jidctred.c).
|
||||
*
|
||||
* NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
|
||||
* requires far fewer arithmetic operations and hence should be faster.
|
||||
* The primary purpose of this particular NEON optimized function is
|
||||
* bit exact compatibility with jpeg-6b.
|
||||
*
|
||||
* TODO: slightly better instruction scheduling can be achieved by expanding
|
||||
* idct_helper/transpose_4x4 macros and reordering instructions,
|
||||
* but readability will suffer somewhat.
|
||||
*/
|
||||
|
||||
/*
 * Fixed-point multipliers shared by the 4x4 and 2x2 scaled IDCT routines.
 * Each FIX_* value is the constant named in its trailing comment multiplied
 * by 2^CONST_BITS and rounded to the nearest integer.
 */
#define CONST_BITS          13

#define FIX_0_211164243     (1730)      /* FIX(0.211164243) */
#define FIX_0_509795579     (4176)      /* FIX(0.509795579) */
#define FIX_0_601344887     (4926)      /* FIX(0.601344887) */
#define FIX_0_720959822     (5906)      /* FIX(0.720959822) */
#define FIX_0_765366865     (6270)      /* FIX(0.765366865) */
#define FIX_0_850430095     (6967)      /* FIX(0.850430095) */
#define FIX_0_899976223     (7373)      /* FIX(0.899976223) */
#define FIX_1_061594337     (8697)      /* FIX(1.061594337) */
#define FIX_1_272758580     (10426)     /* FIX(1.272758580) */
#define FIX_1_451774981     (11893)     /* FIX(1.451774981) */
#define FIX_1_847759065     (15137)     /* FIX(1.847759065) */
#define FIX_2_172734803     (17799)     /* FIX(2.172734803) */
#define FIX_2_562915447     (20995)     /* FIX(2.562915447) */
#define FIX_3_624509785     (29692)     /* FIX(3.624509785) */
|
||||
|
||||
/*
 * Multiplier table for jsimd_idct_4x4_neon.  The function loads it into
 * d0-d2 (d3 receives whatever follows the table and is never used), so the
 * comment on each entry names the 16-bit lane it ends up in.  Every entry
 * has to fit in a signed 16-bit lane.
 */
    .balign 16
jsimd_idct_4x4_neon_consts:
    /* d0 */
    .short          FIX_1_847759065         /* d0[0] */
    .short          -FIX_0_765366865        /* d0[1] */
    .short          -FIX_0_211164243        /* d0[2] */
    .short          FIX_1_451774981         /* d0[3] */
    /* d1 */
    .short          -FIX_2_172734803        /* d1[0] */
    .short          FIX_1_061594337         /* d1[1] */
    .short          -FIX_0_509795579        /* d1[2] */
    .short          -FIX_0_601344887        /* d1[3] */
    /* d2 */
    .short          FIX_0_899976223         /* d2[0] */
    .short          FIX_2_562915447         /* d2[1] */
    .short          1 << (CONST_BITS+1)     /* d2[2] */
    .short          0                       /* d2[3] */
|
||||
|
||||
/*
 * idct_helper (4x4 variant): one 1-D pass of the reduced IDCT, computed on
 * four 16-bit lanes at a time.
 *
 *   \in0 \in1 \in2 \in3 \in5 \in6 \in7
 *          D registers holding coefficients 0, 1, 2, 3, 5, 6 and 7
 *          (coefficient 4 does not take part in the calculation)
 *   \shift rounding right shift applied to the 32-bit sums
 *   \out0 \out1 \out2 \out3
 *          D registers receiving the four 16-bit results
 *
 * The multiplier table (jsimd_idct_4x4_neon_consts) must already be in
 * d0-d2.  All inputs are consumed before the first output is written, so an
 * output register may be the same as an input register.
 * Clobbers q10, q12, q13, q14 and q15.
 */
.macro idct_helper in0, in1, in2, in3, in5, in6, in7, shift, out0, out1, out2, out3
    /* q14 = in0 * 2^(CONST_BITS+1) + in2 * FIX(1.847759065)
     *                              - in6 * FIX(0.765366865)
     */
    vmull.s16       q14, \in0, d2[2]
    vmlal.s16       q14, \in2, d0[0]
    vmlal.s16       q14, \in6, d0[1]

    /* q13 = odd-coefficient sum paired with q14 */
    vmull.s16       q13, \in7, d1[2]
    vmlal.s16       q13, \in5, d1[3]
    vmlal.s16       q13, \in3, d2[0]
    vmlal.s16       q13, \in1, d2[1]

    /* q15 = in0 * 2^(CONST_BITS+1) - in2 * FIX(1.847759065)
     *                              + in6 * FIX(0.765366865)
     */
    vmull.s16       q15, \in0, d2[2]
    vmlsl.s16       q15, \in2, d0[0]
    vmlsl.s16       q15, \in6, d0[1]

    /* q12 = odd-coefficient sum paired with q15 */
    vmull.s16       q12, \in7, d0[2]
    vmlal.s16       q12, \in5, d0[3]
    vmlal.s16       q12, \in3, d1[0]
    vmlal.s16       q12, \in1, d1[1]

    /* out0 / out3 = descaled (q14 + q13) / (q14 - q13) */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

    /* A narrowing shift cannot shift by more than 16, so larger shifts are
     * done as a full-width rounding shift followed by a plain narrow.
     */
.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \out0, q10
    vmovn.s32       \out3, q14
.else
    vrshrn.s32      \out0, q10, #\shift
    vrshrn.s32      \out3, q14, #\shift
.endif

    /* out1 / out2 = descaled (q15 + q12) / (q15 - q12) */
    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q15, q15, #\shift
    vmovn.s32       \out1, q10
    vmovn.s32       \out2, q15
.else
    vrshrn.s32      \out1, q10, #\shift
    vrshrn.s32      \out2, q15, #\shift
.endif

.endm
|
||||
|
||||
/*
 * void jsimd_idct_4x4_neon (void * dct_table, JCOEFPTR coef_block,
 *                           JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * In:    r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 *        (the :128 qualifiers below require dct_table and coef_block to be
 *        16-byte aligned)
 * Out:   4 bytes stored at output_buf[i] + output_col for i = 0..3
 * Uses:  r0-r3, ip and the NEON registers; d8-d15 are saved and restored
 */
asm_function jsimd_idct_4x4_neon

    /* Incoming arguments */
    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    /* Scratch names; r0-r2 are recycled once the arguments are consumed */
    CONSTS_PTR      .req ip
    OUT_ROW0        .req r0
    OUT_ROW1        .req r1
    OUT_ROW2        .req r2
    OUT_ROW3        .req ip

    vpush           {d8-d15}

    /* Fetch the multiplier table into d0-d2 (d3 is only padding) */
    adr             CONSTS_PTR, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [CONSTS_PTR, :128]

    /* Read the coefficient rows that are needed.  Register layout:
     *
     *   row     columns 0-3   columns 4-7
     *    0          d4            d5
     *    1          d6            d7
     *    2          d8            d9
     *    3          d10           d11
     *    4          (not loaded)
     *    5          d12           d13
     *    6          d14           d15
     *    7          d16           d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 4 */
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!

    /* Dequantize: multiply each row by the matching row of the table
     * (table loads are interleaved with the multiplies)
     */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16       /* step over row 4 */
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1: columns 0-3 first, then columns 4-7, each followed by a
     * transpose (transpose_4x4 is a macro defined elsewhere in this file)
     */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2: d5 (the transposed 5th vector) is not an input, mirroring
     * the unused coefficient 4 of pass 1
     */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit: add 0x80 and saturate to unsigned 8 bit.
     * Afterwards d26 = rows 0 and 1, d27 = rows 2 and 3 (4 bytes per row).
     */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Fetch the four output row pointers and advance them to the column */
    ldmia           OUTPUT_BUF, {OUT_ROW0, OUT_ROW1, OUT_ROW2, OUT_ROW3}
    add             OUT_ROW0, OUT_ROW0, OUTPUT_COL
    add             OUT_ROW1, OUT_ROW1, OUTPUT_COL
    add             OUT_ROW2, OUT_ROW2, OUTPUT_COL
    add             OUT_ROW3, OUT_ROW3, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* Far fewer instructions are needed on little endian systems when the
     * OS kernel is not configured to trap unaligned memory accesses:
     * each row is written with a single 32-bit store.
     */
    vst1.32         {d26[0]}, [OUT_ROW0]!
    vst1.32         {d27[0]}, [OUT_ROW2]!
    vst1.32         {d26[1]}, [OUT_ROW1]!
    vst1.32         {d27[1]}, [OUT_ROW3]!
#else
    /* Alignment-safe path: one byte at a time.  Rows 0 and 2 first ... */
    vst1.8          {d26[0]}, [OUT_ROW0]!
    vst1.8          {d27[0]}, [OUT_ROW2]!
    vst1.8          {d26[1]}, [OUT_ROW0]!
    vst1.8          {d27[1]}, [OUT_ROW2]!
    vst1.8          {d26[2]}, [OUT_ROW0]!
    vst1.8          {d27[2]}, [OUT_ROW2]!
    vst1.8          {d26[3]}, [OUT_ROW0]!
    vst1.8          {d27[3]}, [OUT_ROW2]!

    /* ... then rows 1 and 3 */
    vst1.8          {d26[4]}, [OUT_ROW1]!
    vst1.8          {d27[4]}, [OUT_ROW3]!
    vst1.8          {d26[5]}, [OUT_ROW1]!
    vst1.8          {d27[5]}, [OUT_ROW3]!
    vst1.8          {d26[6]}, [OUT_ROW1]!
    vst1.8          {d27[6]}, [OUT_ROW3]!
    vst1.8          {d26[7]}, [OUT_ROW1]!
    vst1.8          {d27[7]}, [OUT_ROW3]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          CONSTS_PTR
    .unreq          OUT_ROW0
    .unreq          OUT_ROW1
    .unreq          OUT_ROW2
    .unreq          OUT_ROW3
.endfunc
|
||||
|
||||
.purgem idct_helper
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_idct_2x2_neon
|
||||
*
|
||||
* This function contains inverse-DCT code for getting reduced-size
|
||||
* 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
|
||||
* and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
|
||||
* function from jpeg-6b (jidctred.c).
|
||||
*
|
||||
* NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
|
||||
* requires far fewer arithmetic operations and hence should be faster.
|
||||
* The primary purpose of this particular NEON optimized function is
|
||||
* bit exact compatibility with jpeg-6b.
|
||||
*/
|
||||
|
||||
/*
 * Multiplier table for jsimd_idct_2x2_neon.  It is loaded into d0, so the
 * comment on each entry names the 16-bit lane it ends up in.
 */
    .balign 8
jsimd_idct_2x2_neon_consts:
    .short          -FIX_0_720959822        /* d0[0] */
    .short          FIX_0_850430095         /* d0[1] */
    .short          -FIX_1_272758580        /* d0[2] */
    .short          FIX_3_624509785         /* d0[3] */
|
||||
|
||||
/*
 * idct_helper (2x2 variant): one 1-D pass of the reduced IDCT, computed on
 * four 16-bit lanes at a time.
 *
 *   \in0 \in1 \in3 \in5 \in7
 *          D registers holding coefficients 0, 1, 3, 5 and 7
 *   \shift rounding right shift applied to the 32-bit sums
 *   \out0 \out1
 *          D registers receiving the two 16-bit results
 *
 * The multiplier table (jsimd_idct_2x2_neon_consts) must already be in d0.
 * Clobbers q10, q13 and q14.
 */
.macro idct_helper in0, in1, in3, in5, in7, shift, out0, out1
    /* q14 = in0 * 2^15 */
    vshll.s16       q14, \in0, #15
    /* q13 = in1 * FIX(3.624509785) - in3 * FIX(1.272758580)
     *     + in5 * FIX(0.850430095) - in7 * FIX(0.720959822)
     */
    vmull.s16       q13, \in1, d0[3]
    vmlal.s16       q13, \in3, d0[2]
    vmlal.s16       q13, \in5, d0[1]
    vmlal.s16       q13, \in7, d0[0]

    /* out0 / out1 = descaled (q14 + q13) / (q14 - q13) */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

    /* A narrowing shift cannot shift by more than 16, so larger shifts are
     * done as a full-width rounding shift followed by a plain narrow.
     */
.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \out0, q10
    vmovn.s32       \out1, q14
.else
    vrshrn.s32      \out0, q10, #\shift
    vrshrn.s32      \out1, q14, #\shift
.endif

.endm
|
||||
|
||||
/*
 * void jsimd_idct_2x2_neon (void * dct_table, JCOEFPTR coef_block,
 *                           JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * In:    r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 *        (the :128 qualifiers below require dct_table and coef_block to be
 *        16-byte aligned)
 * Out:   2 bytes stored at output_buf[i] + output_col for i = 0..1
 * Uses:  r0-r3, ip and NEON registers; d8-d15 are saved and restored
 */
asm_function jsimd_idct_2x2_neon

    /* Incoming arguments */
    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    /* Scratch names; r0 is recycled once dct_table has been consumed */
    CONSTS_PTR      .req ip
    OUT_ROW0        .req r0
    OUT_ROW1        .req ip

    vpush           {d8-d15}

    /* Fetch the multiplier table into d0 */
    adr             CONSTS_PTR, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [CONSTS_PTR, :64]

    /* Read the coefficient rows that are needed.  Register layout:
     *
     *   row     columns 0-3   columns 4-7
     *    0          d4            d5
     *    1          d6            d7
     *    2          (not loaded)
     *    3          d10           d11
     *    4          (not loaded)
     *    5          d12           d13
     *    6          (not loaded)
     *    7          d16           d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 2 */
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 4 */
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 6 */
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!

    /* Dequantize: multiply each loaded row by the matching table row */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16       /* step over row 2 */
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16       /* step over row 4 */
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16       /* step over row 6 */
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1.  The disabled branch is the macro-based form; the enabled
     * branch performs the same arithmetic for columns 0-3 and 4-7 with the
     * instructions written out by hand.
     */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    /* q13 / q12 = odd-coefficient sums for columns 0-3 / 4-7 */
    vmull.s16       q13, d6,  d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7,  d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    /* q14 / q15 = row 0 * 2^15 for columns 0-3 / 4-7 */
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    /* q2 (d4:d5) = first result row, q3 (d6:d7) = second result row */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    /* Regroup so that lanes 0 and 1 of d4, d6, d10, d7 and d11 hold the
     * two result rows for columns 0, 1, 3, 5 and 7 respectively; the
     * remaining lanes of d6, d7, d10 and d11 are leftovers that never
     * reach the output
     */
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2: only lanes 0 and 1 of the results are used below */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: add 0x80 and saturate to unsigned 8 bit.  The second
     * narrowing reads q13 after its d26 half has been overwritten; only
     * bytes 4-7 of that result are stored, and they come from the still
     * untouched d27 half.
     */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Fetch the two output row pointers and advance them to the column */
    ldmia           OUTPUT_BUF, {OUT_ROW0, OUT_ROW1}
    add             OUT_ROW0, OUT_ROW0, OUTPUT_COL
    add             OUT_ROW1, OUT_ROW1, OUTPUT_COL

    /* Two pixels per row, written one byte at a time */
    vst1.8          {d26[0]}, [OUT_ROW0]!
    vst1.8          {d27[4]}, [OUT_ROW0]!
    vst1.8          {d26[1]}, [OUT_ROW1]!
    vst1.8          {d27[5]}, [OUT_ROW1]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          CONSTS_PTR
    .unreq          OUT_ROW0
    .unreq          OUT_ROW1
.endfunc
|
||||
|
||||
.purgem idct_helper
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_ycc_extrgb_convert_neon
|
||||
* jsimd_ycc_extbgr_convert_neon
|
||||
|
||||
Reference in New Issue
Block a user