NEON-optimized 2x2 and 4x4 scaled iDCTs

2011-06-17 21:12:58 +00:00
parent b6fb92eeff
commit e3f7e75525
3 changed files with 414 additions and 0 deletions
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -569,6 +569,15 @@ EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
                                      JSAMPARRAY output_buf,
                                      JDIMENSION output_col));

+EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col)); /* ARM NEON: 8x8 block -> 2x2 output */
+EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col)); /* ARM NEON: 8x8 block -> 4x4 output */
+
 /* SIMD Inverse DCT */
 EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
                                       JCOEFPTR coef_block,
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -440,6 +440,21 @@ jsimd_can_idct_2x2 (void)
 {
  init_simd();

+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_ARM_NEON))
+    return 1;
+
  return 0;
 }

@@ -448,6 +463,21 @@ jsimd_can_idct_4x4 (void)
 {
  init_simd();

+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_ARM_NEON))
+    return 1;
+
  return 0;
 }

@@ -456,6 +486,8 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
 {
+  if ((simd_support & JSIMD_ARM_NEON))
+    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
 }

 GLOBAL(void)
@@ -463,6 +495,8 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
 {
+  if ((simd_support & JSIMD_ARM_NEON))
+    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
 }

 GLOBAL(int)
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -32,6 +32,9 @@
 .object_arch armv4
 .arm

+/* Non-zero: jsimd_idct_4x4_neon stores its output one byte at a time instead of using 32-bit lane stores */
+#define RESPECT_STRICT_ALIGNMENT 1
+
 /*****************************************************************************/

 /* Supplementary macro for setting function attributes */
@@ -246,6 +249,374 @@ asm_function jsimd_idct_ifast_neon

 /*****************************************************************************/

+/*
+ * jsimd_idct_4x4_neon
+ *
+ * This function contains inverse-DCT code for producing reduced-size
+ * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
+ * function from jpeg-6b (jidctred.c).
+ *
+ * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
+ *       requires far fewer arithmetic operations and hence should be faster.
+ *       The primary purpose of this particular NEON-optimized function is
+ *       bit-exact compatibility with jpeg-6b.
+ *
+ * TODO: slightly better instruction scheduling can be achieved by expanding
+ *       the idct_helper/transpose_4x4 macros and reordering instructions,
+ *       but readability will suffer somewhat.
+ */
+
+#define CONST_BITS  13  /* FIX(x) below is x * 2^CONST_BITS rounded to the nearest integer */
+
+#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
+#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
+#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
+#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
+#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
+#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
+#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
+#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
+#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
+#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
+#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
+#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
+#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
+#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+
+.balign 16  /* the table is loaded with a :128 (16-byte) alignment hint */
+jsimd_idct_4x4_neon_consts:
+    .short     FIX_1_847759065     /* d0[0]: applied to x8  (added in q14, subtracted in q15) */
+    .short     -FIX_0_765366865    /* d0[1]: applied to x14 (added in q14, subtracted in q15) */
+    .short     -FIX_0_211164243    /* d0[2]: applied to x16 in q12 */
+    .short     FIX_1_451774981     /* d0[3]: applied to x12 in q12 */
+    .short     -FIX_2_172734803    /* d1[0]: applied to x10 in q12 */
+    .short     FIX_1_061594337     /* d1[1]: applied to x6  in q12 */
+    .short     -FIX_0_509795579    /* d1[2]: applied to x16 in q13 */
+    .short     -FIX_0_601344887    /* d1[3]: applied to x12 in q13 */
+    .short     FIX_0_899976223     /* d2[0]: applied to x10 in q13 */
+    .short     FIX_2_562915447     /* d2[1]: applied to x6  in q13 */
+    .short     1 << (CONST_BITS+1) /* d2[2]: scale applied to x4 in q14 and q15 */
+    .short     0                   /* d2[3]: not referenced by idct_helper */
+/* idct_helper (4x4): one 1-D pass on four 16-bit lanes; expects d0-d2 = consts table; clobbers q10, q12-q15 */
+.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
+    vmull.s16       q14, \x4,  d2[2]  /* q14  = x4  * (1 << (CONST_BITS+1)) */
+    vmlal.s16       q14, \x8,  d0[0]  /* q14 += x8  *  FIX_1_847759065 */
+    vmlal.s16       q14, \x14, d0[1]  /* q14 += x14 * -FIX_0_765366865 */
+
+    vmull.s16       q13, \x16, d1[2]  /* q13  = x16 * -FIX_0_509795579 */
+    vmlal.s16       q13, \x12, d1[3]  /* q13 += x12 * -FIX_0_601344887 */
+    vmlal.s16       q13, \x10, d2[0]  /* q13 += x10 *  FIX_0_899976223 */
+    vmlal.s16       q13, \x6,  d2[1]  /* q13 += x6  *  FIX_2_562915447 */
+
+    vmull.s16       q15, \x4,  d2[2]  /* q15  = x4  * (1 << (CONST_BITS+1)) */
+    vmlsl.s16       q15, \x8,  d0[0]  /* q15 -= x8  *  FIX_1_847759065 */
+    vmlsl.s16       q15, \x14, d0[1]  /* q15 -= x14 * -FIX_0_765366865 */
+
+    vmull.s16       q12, \x16, d0[2]  /* q12  = x16 * -FIX_0_211164243 */
+    vmlal.s16       q12, \x12, d0[3]  /* q12 += x12 *  FIX_1_451774981 */
+    vmlal.s16       q12, \x10, d1[0]  /* q12 += x10 * -FIX_2_172734803 */
+    vmlal.s16       q12, \x6,  d1[1]  /* q12 += x6  *  FIX_1_061594337 */
+
+    vadd.s32        q10, q14, q13  /* q14 + q13 becomes output y26 */
+    vsub.s32        q14, q14, q13  /* q14 - q13 becomes output y29 */
+
+.if \shift > 16
+    vrshr.s32       q10,  q10, #\shift  /* narrowing shift tops out at 16: round-shift at 32 bits ... */
+    vrshr.s32       q14,  q14, #\shift
+    vmovn.s32       \y26, q10           /* ... then narrow to 16 bits separately */
+    vmovn.s32       \y29, q14
+.else
+    vrshrn.s32      \y26, q10, #\shift  /* rounding shift and narrow to 16 bits in one step */
+    vrshrn.s32      \y29, q14, #\shift
+.endif
+
+    vadd.s32        q10, q15, q12  /* q15 + q12 becomes output y27 */
+    vsub.s32        q15, q15, q12  /* q15 - q12 becomes output y28 */
+
+.if \shift > 16
+    vrshr.s32       q10,  q10, #\shift  /* same two-step descale as above */
+    vrshr.s32       q15,  q15, #\shift
+    vmovn.s32       \y27, q10
+    vmovn.s32       \y28, q15
+.else
+    vrshrn.s32      \y27, q10, #\shift
+    vrshrn.s32      \y28, q15, #\shift
+.endif
+
+.endm
+
+asm_function jsimd_idct_4x4_neon
+    /* Arguments (see the C prototype): r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col */
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0  /* TMP1-TMP3 alias r0-r2; they are only written by the final ldmia */
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    vpush           {d8-d15}  /* d8-d15 are overwritten with coefficients below; restored before return */
+
+    /* Load constants (d3 is just used for padding) */
+    adr             TMP4, jsimd_idct_4x4_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]  /* 32-byte load from a 24-byte table; d3 is never read */
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d4      | d5
+     *   1 | d6      | d7
+     *   2 | d8      | d9
+     *   3 | d10     | d11
+     *   4 | -       | -
+     *   5 | d12     | d13
+     *   6 | d14     | d15
+     *   7 | d16     | d17
+     */
+    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
+    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
+    add COEF_BLOCK, COEF_BLOCK, #16  /* skip the 16 bytes of row 4, which is not used */
+    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
+    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
+    /* dequantize: multiply each coefficient row by the matching row of DCT_TABLE (loads interleaved) */
+    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
+    vmul.s16        q2, q2, q9    /* row 0 */
+    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
+    vmul.s16        q3, q3, q10   /* row 1 */
+    vmul.s16        q4, q4, q11   /* row 2 */
+    add             DCT_TABLE, DCT_TABLE, #16  /* skip table row 4, matching the skipped coefficient row */
+    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
+    vmul.s16        q5, q5, q12   /* row 3 */
+    vmul.s16        q6, q6, q13   /* row 5 */
+    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
+    vmul.s16        q7, q7, q14   /* row 6 */
+    vmul.s16        q8, q8, q15   /* row 7 */
+
+    /* Pass 1: columns 0-3 (d4..d16), then columns 4-7 (d5..d17); descale by 12, results in place */
+    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
+    transpose_4x4   d4, d6, d8, d10
+    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
+    transpose_4x4   d5, d7, d9, d11
+
+    /* Pass 2: descale by 19 into d26-d29; d5 is not an input (presumably pass-1 column 4 after the transpose) */
+    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
+    transpose_4x4   d26, d27, d28, d29
+
+    /* Range limit: add 0x80 to every 16-bit lane, then saturate-narrow to unsigned 8-bit */
+    vmov.u16        q15, #0x80
+    vadd.s16        q13, q13, q15
+    vadd.s16        q14, q14, q15
+    vqmovun.s16     d26, q13  /* d26 = 8 bytes narrowed from q13 (d26:d27) */
+    vqmovun.s16     d27, q14  /* d27 = 8 bytes narrowed from q14 (d28:d29) */
+
+    /* Store results: load four row pointers from OUTPUT_BUF, then offset each by OUTPUT_COL */
+    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    add             TMP3, TMP3, OUTPUT_COL
+    add             TMP4, TMP4, OUTPUT_COL
+
+#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
+    /* We can use far fewer instructions on little-endian systems if the
+     * OS kernel is not configured to trap unaligned memory accesses
+     */
+    vst1.32         {d26[0]}, [TMP1]!
+    vst1.32         {d27[0]}, [TMP3]!
+    vst1.32         {d26[1]}, [TMP2]!
+    vst1.32         {d27[1]}, [TMP4]!
+#else
+    vst1.8          {d26[0]}, [TMP1]!  /* d26 bytes 0-3 -> 1st row pointer, d27 bytes 0-3 -> 3rd */
+    vst1.8          {d27[0]}, [TMP3]!
+    vst1.8          {d26[1]}, [TMP1]!
+    vst1.8          {d27[1]}, [TMP3]!
+    vst1.8          {d26[2]}, [TMP1]!
+    vst1.8          {d27[2]}, [TMP3]!
+    vst1.8          {d26[3]}, [TMP1]!
+    vst1.8          {d27[3]}, [TMP3]!
+
+    vst1.8          {d26[4]}, [TMP2]!  /* d26 bytes 4-7 -> 2nd row pointer, d27 bytes 4-7 -> 4th */
+    vst1.8          {d27[4]}, [TMP4]!
+    vst1.8          {d26[5]}, [TMP2]!
+    vst1.8          {d27[5]}, [TMP4]!
+    vst1.8          {d26[6]}, [TMP2]!
+    vst1.8          {d27[6]}, [TMP4]!
+    vst1.8          {d26[7]}, [TMP2]!
+    vst1.8          {d27[7]}, [TMP4]!
+#endif
+
+    vpop            {d8-d15}
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+.endfunc
+
+.purgem idct_helper  /* drop the 4x4 helper; the 2x2 code below defines its own idct_helper */
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_2x2_neon
+ *
+ * This function contains inverse-DCT code for producing reduced-size
+ * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
+ * function from jpeg-6b (jidctred.c).
+ *
+ * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
+ *       requires far fewer arithmetic operations and hence should be faster.
+ *       The primary purpose of this particular NEON-optimized function is
+ *       bit-exact compatibility with jpeg-6b.
+ */
+
+.balign 8  /* the table is loaded with a :64 (8-byte) alignment hint */
+jsimd_idct_2x2_neon_consts:
+    .short     -FIX_0_720959822    /* d0[0]: applied to x16 (coefficient row 7 in pass 1) */
+    .short     FIX_0_850430095     /* d0[1]: applied to x12 (coefficient row 5 in pass 1) */
+    .short     -FIX_1_272758580    /* d0[2]: applied to x10 (coefficient row 3 in pass 1) */
+    .short     FIX_3_624509785     /* d0[3]: applied to x6  (coefficient row 1 in pass 1) */
+/* idct_helper (2x2): one 1-D pass on four 16-bit lanes; expects d0 = consts table; clobbers q10, q13, q14 */
+.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
+    vshll.s16  q14,  \x4,  #15     /* q14  = x4 << 15, i.e. x4 << (CONST_BITS+2) */
+    vmull.s16  q13,  \x6,  d0[3]   /* q13  = x6  *  FIX_3_624509785 */
+    vmlal.s16  q13,  \x10, d0[2]   /* q13 += x10 * -FIX_1_272758580 */
+    vmlal.s16  q13,  \x12, d0[1]   /* q13 += x12 *  FIX_0_850430095 */
+    vmlal.s16  q13,  \x16, d0[0]   /* q13 += x16 * -FIX_0_720959822 */
+
+    vadd.s32   q10,  q14,  q13     /* q14 + q13 becomes output y26 */
+    vsub.s32   q14,  q14,  q13     /* q14 - q13 becomes output y27 */
+
+.if \shift > 16
+    vrshr.s32  q10,  q10,  #\shift /* narrowing shift tops out at 16: round-shift at 32 bits ... */
+    vrshr.s32  q14,  q14,  #\shift
+    vmovn.s32  \y26, q10           /* ... then narrow to 16 bits separately */
+    vmovn.s32  \y27, q14
+.else
+    vrshrn.s32 \y26, q10,  #\shift /* rounding shift and narrow to 16 bits in one step */
+    vrshrn.s32 \y27, q14,  #\shift
+.endif
+
+.endm
+
+asm_function jsimd_idct_2x2_neon
+    /* Arguments (see the C prototype): r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col */
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0  /* aliases DCT_TABLE; as TMP1 it is only written by the final ldmia */
+    TMP2            .req ip
+
+    vpush           {d8-d15}  /* d10-d13 are overwritten below; the whole d8-d15 range is saved and restored */
+
+    /* Load constants */
+    adr             TMP2, jsimd_idct_2x2_neon_consts
+    vld1.16         {d0}, [TMP2, :64]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d4      | d5
+     *   1 | d6      | d7
+     *   2 | -       | -
+     *   3 | d10     | d11
+     *   4 | -       | -
+     *   5 | d12     | d13
+     *   6 | -       | -
+     *   7 | d16     | d17
+     */
+    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
+    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 2 */
+    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
+    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 4 */
+    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
+    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 6 */
+    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
+    /* Dequantize: multiply each loaded row by the matching row of DCT_TABLE */
+    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
+    vmul.s16        q2, q2, q9    /* row 0 */
+    vmul.s16        q3, q3, q10   /* row 1 */
+    add             DCT_TABLE, DCT_TABLE, #16  /* skip table row 2 */
+    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
+    vmul.s16        q5, q5, q12   /* row 3 */
+    add             DCT_TABLE, DCT_TABLE, #16  /* skip table row 4 */
+    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
+    vmul.s16        q6, q6, q13   /* row 5 */
+    add             DCT_TABLE, DCT_TABLE, #16  /* skip table row 6 */
+    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
+    vmul.s16        q8, q8, q15   /* row 7 */
+
+    /* Pass 1: descale by 13; the disabled branch is the macro-based form, the live branch is hand-expanded */
+#if 0
+    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
+    transpose_4x4   d4, d6, d8,  d10
+    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
+    transpose_4x4   d5, d7, d9,  d11
+#else
+    vmull.s16       q13, d6,  d0[3]  /* q13: weighted sum of rows 1, 3, 5, 7 for columns 0-3 */
+    vmlal.s16       q13, d10, d0[2]
+    vmlal.s16       q13, d12, d0[1]
+    vmlal.s16       q13, d16, d0[0]
+    vmull.s16       q12, d7,  d0[3]  /* q12: the same sum for columns 4-7 */
+    vmlal.s16       q12, d11, d0[2]
+    vmlal.s16       q12, d13, d0[1]
+    vmlal.s16       q12, d17, d0[0]
+    vshll.s16       q14, d4,  #15    /* row 0 << 15, columns 0-3 */
+    vshll.s16       q15, d5,  #15    /* row 0 << 15, columns 4-7 */
+    vadd.s32        q10, q14, q13
+    vsub.s32        q14, q14, q13
+    vrshrn.s32      d4,  q10, #13    /* d4 = sum,        columns 0-3 */
+    vrshrn.s32      d6,  q14, #13    /* d6 = difference, columns 0-3 */
+    vadd.s32        q10, q15, q12
+    vsub.s32        q14, q15, q12
+    vrshrn.s32      d5,  q10, #13    /* d5 = sum,        columns 4-7 */
+    vrshrn.s32      d7,  q14, #13    /* d7 = difference, columns 4-7 */
+    vtrn.16         q2,  q3          /* interleave the two result rows lane by lane */
+    vtrn.32         q3,  q5          /* lanes 0-1 of d4, d6, d10, d7, d11 now hold columns 0, 1, 3, 5, 7 */
+#endif
+
+    /* Pass 2: descale by 20 into d26/d27; only lanes 0-1 of each input are meaningful */
+    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
+
+    /* Range limit: add 0x80 to every 16-bit lane, then saturate-narrow to unsigned 8-bit */
+    vmov.u16        q15, #0x80
+    vadd.s16        q13, q13, q15
+    vqmovun.s16     d26, q13  /* d26 bytes 0-3 come from old d26, bytes 4-7 from d27 */
+    vqmovun.s16     d27, q13  /* q13 is now {narrowed d26, old d27}; only bytes 4-7 (from old d27) are stored */
+
+    /* Store results: load two row pointers from OUTPUT_BUF, then offset each by OUTPUT_COL */
+    ldmia           OUTPUT_BUF, {TMP1, TMP2}
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+
+    vst1.8          {d26[0]}, [TMP1]!  /* first row pointer:  sum lane 0, then difference lane 0 */
+    vst1.8          {d27[4]}, [TMP1]!
+    vst1.8          {d26[1]}, [TMP2]!  /* second row pointer: sum lane 1, then difference lane 1 */
+    vst1.8          {d27[5]}, [TMP2]!
+
+    vpop            {d8-d15}
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+.endfunc
+
+.purgem idct_helper  /* undefine the 2x2 helper macro */
+
+/*****************************************************************************/
+
 /*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon