Neon: Intrinsics impl. of 2x2 and 4x4 scaled IDCTs

The previous AArch32 and AArch64 GAS implementations have been removed, since the intrinsics implementations provide the same or better performance.
2018-09-18 18:28:31 +00:00
parent 4574f01f43
commit 141f26ff6d
4 changed files with 489 additions and 876 deletions
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -267,7 +267,7 @@ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)

 set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
  arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
-  arm/jquanti-neon.c)
+  arm/jidctred-neon.c arm/jquanti-neon.c)
 if(NEON_INTRINSICS)
  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
 endif()
@@ -281,9 +281,11 @@ endif()
 if(BITS EQUAL 32)
  set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)
 endif()
+if(NOT NEON_INTRINSICS)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
+endif()

-add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S
-  arm/aarch${BITS}/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)

 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
--- a/simd/arm/aarch32/jsimd_neon.S
+++ b/simd/arm/aarch32/jsimd_neon.S
@@ -38,9 +38,6 @@
 .syntax unified


-#define RESPECT_STRICT_ALIGNMENT  1
-
-
 /*****************************************************************************/

 /* Supplementary macro for setting function attributes */
@@ -59,16 +56,6 @@ _\fname:
 #endif
 .endm

-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16         \x0, \x1
-    vtrn.16         \x2, \x3
-    vtrn.32         \x0, \x2
-    vtrn.32         \x1, \x3
-.endm
-
-
-#ifndef NEON_INTRINSICS

 #define CENTERJSAMPLE  128

@@ -902,376 +889,6 @@ asm_function jsimd_idct_ifast_neon
    .unreq          TMP3
    .unreq          TMP4

-#endif  /* NEON_INTRINSICS */
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- *
- * TODO: a bit better instructions scheduling can be achieved by expanding
- *       idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-
-.balign 16
-jsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* d0[0] */
-  .short -FIX_0_765366865       /* d0[1] */
-  .short -FIX_0_211164243       /* d0[2] */
-  .short FIX_1_451774981        /* d0[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* d2[0] */
-  .short FIX_2_562915447        /* d2[1] */
-  .short 1 << (CONST_BITS + 1)  /* d2[2] */
-  .short 0                      /* d2[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4, d2[2]
-    vmlal.s16       q14, \x8, d0[0]
-    vmlal.s16       q14, \x14, d0[1]
-
-    vmull.s16       q13, \x16, d1[2]
-    vmlal.s16       q13, \x12, d1[3]
-    vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6, d2[1]
-
-    vmull.s16       q15, \x4, d2[2]
-    vmlsl.s16       q15, \x8, d0[0]
-    vmlsl.s16       q15, \x14, d0[1]
-
-    vmull.s16       q12, \x16, d0[2]
-    vmlal.s16       q12, \x12, d0[3]
-    vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6, d1[1]
-
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y29, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift
-    vrshrn.s32      \y29, q14, #\shift
-  .endif
-
-    vadd.s32        q10, q15, q12
-    vsub.s32        q15, q15, q12
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q15, q15, #\shift
-    vmovn.s32       \y27, q10
-    vmovn.s32       \y28, q15
-  .else
-    vrshrn.s32      \y27, q10, #\shift
-    vrshrn.s32      \y28, q15, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    vpush           {d8 - d15}
-
-    /* Load constants (d3 is just used for padding) */
-    adr             TMP4, jsimd_idct_4x4_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | d8      | d9
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | d14     | d15
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
-    add COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* dequantize */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q3, q3, q10
-    vmul.s16        q4, q4, q11
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    vmul.s16        q6, q6, q13
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q7, q7, q14
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 */
-    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
-    transpose_4x4   d5, d7, d9, d11
-
-    /* Pass 2 */
-    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
-    transpose_4x4   d26, d27, d28, d29
-
-    /* Range limit */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vadd.s16        q14, q14, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q14
-
-    /* Store results to the output buffer */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    vst1.32         {d26[0]}, [TMP1]!
-    vst1.32         {d27[0]}, [TMP3]!
-    vst1.32         {d26[1]}, [TMP2]!
-    vst1.32         {d27[1]}, [TMP4]!
-#else
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[0]}, [TMP3]!
-    vst1.8          {d26[1]}, [TMP1]!
-    vst1.8          {d27[1]}, [TMP3]!
-    vst1.8          {d26[2]}, [TMP1]!
-    vst1.8          {d27[2]}, [TMP3]!
-    vst1.8          {d26[3]}, [TMP1]!
-    vst1.8          {d27[3]}, [TMP3]!
-
-    vst1.8          {d26[4]}, [TMP2]!
-    vst1.8          {d27[4]}, [TMP4]!
-    vst1.8          {d26[5]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP4]!
-    vst1.8          {d26[6]}, [TMP2]!
-    vst1.8          {d27[6]}, [TMP4]!
-    vst1.8          {d26[7]}, [TMP2]!
-    vst1.8          {d27[7]}, [TMP4]!
-#endif
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* d0[0] */
-  .short FIX_0_850430095   /* d0[1] */
-  .short -FIX_1_272758580  /* d0[2] */
-  .short FIX_3_624509785   /* d0[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16       q14, \x4, #15
-    vmull.s16       q13, \x6, d0[3]
-    vmlal.s16       q13, \x10, d0[2]
-    vmlal.s16       q13, \x12, d0[1]
-    vmlal.s16       q13, \x16, d0[0]
-
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y27, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift
-    vrshrn.s32      \y27, q14, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req ip
-
-    vpush           {d8 - d15}
-
-    /* Load constants */
-    adr             TMP2, jsimd_idct_2x2_neon_consts
-    vld1.16         {d0}, [TMP2, :64]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | -       | -
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | -       | -
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* Dequantize */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vmul.s16        q3, q3, q10
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
-    vmul.s16        q6, q6, q13
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 */
-#if 0
-    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9, d11
-#else
-    vmull.s16       q13, d6, d0[3]
-    vmlal.s16       q13, d10, d0[2]
-    vmlal.s16       q13, d12, d0[1]
-    vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7, d0[3]
-    vmlal.s16       q12, d11, d0[2]
-    vmlal.s16       q12, d13, d0[1]
-    vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4, #15
-    vshll.s16       q15, d5, #15
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-    vrshrn.s32      d4, q10, #13
-    vrshrn.s32      d6, q14, #13
-    vadd.s32        q10, q15, q12
-    vsub.s32        q14, q15, q12
-    vrshrn.s32      d5, q10, #13
-    vrshrn.s32      d7, q14, #13
-    vtrn.16         q2, q3
-    vtrn.32         q3, q5
-#endif
-
-    /* Pass 2 */
-    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
-
-    /* Range limit */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q13
-
-    /* Store results to the output buffer */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[4]}, [TMP1]!
-    vst1.8          {d26[1]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP2]!
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
-#ifndef NEON_INTRINSICS

 /*****************************************************************************/

@@ -1581,5 +1198,3 @@ generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

 .purgem do_load
 .purgem do_store
-
-#endif  /* NEON_INTRINSICS */
--- a/simd/arm/aarch64/jsimd_neon.S
+++ b/simd/arm/aarch64/jsimd_neon.S
@@ -39,8 +39,6 @@
 .section .rodata, "a", %progbits
 #endif

-#ifndef NEON_INTRINSICS
-
 /* Constants for jsimd_idct_islow_neon() */

 #define F_0_298   2446  /* FIX(0.298631336) */
@@ -88,51 +86,6 @@ Ljsimd_idct_islow_neon_consts:
 #undef F_2_562
 #undef F_3_072

-#endif
-
-/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* v0.h[0] */
-  .short -FIX_0_765366865       /* v0.h[1] */
-  .short -FIX_0_211164243       /* v0.h[2] */
-  .short FIX_1_451774981        /* v0.h[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* v2.h[0] */
-  .short FIX_2_562915447        /* v2.h[1] */
-  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
-  .short 0                      /* v2.h[3] */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* v14[0] */
-  .short FIX_0_850430095   /* v14[1] */
-  .short -FIX_1_272758580  /* v14[2] */
-  .short FIX_3_624509785   /* v14[3] */
-
-#ifndef NEON_INTRINSICS
-
 /* Constants for jsimd_ycc_*_neon() */

 .balign 16
@@ -151,8 +104,6 @@ Ljsimd_rgb_ycc_neon_consts:
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128

-#endif
-
 /* Constants for jsimd_fdct_islow_neon() */

 #define F_0_298   2446  /* FIX(0.298631336) */
@@ -200,8 +151,6 @@ Ljsimd_fdct_islow_neon_consts:
 #undef F_2_562
 #undef F_3_072

-#ifndef NEON_INTRINSICS
-
 /* Constants for jsimd_huff_encode_one_block_neon() */

 .balign 16
@@ -233,11 +182,6 @@ Ljsimd_huff_encode_one_block_neon_consts:
    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */

-#endif
-
-
-#define RESPECT_STRICT_ALIGNMENT  1
-

 /*****************************************************************************/

@@ -268,47 +212,6 @@ _\fname:
 #endif
 .endm

-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
-    ins             \xi\xilen[0], \x0\xilen[0]
-    ins             \x1\xilen[0], \x0\xilen[1]
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose elements of 2 different registers */
-.macro transpose x0, x1, xi, xilen, literal
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x2\x2len
-    trn2            \x2\x2len, \xi\x0len, \x2\x2len
-    mov             \xi\xilen, \x1\xilen
-    trn1            \x1\x1len, \x1\x1len, \x3\x3len
-    trn2            \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x1\x1len
-    trn2            \x1\x2len, \xi\x0len, \x1\x2len
-    mov             \xi\xilen, \x2\xilen
-    trn1            \x2\x2len, \x2\x2len, \x3\x3len
-    trn2            \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
-    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
-    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
-.endm
-
-#ifndef NEON_INTRINSICS
-
 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
@@ -937,395 +840,6 @@ asm_function jsimd_idct_islow_neon
 #undef XFIX_N_2_562
 #undef XFIX_P_3_072

-#endif  /* NEON_INTRINSICS */
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- *
- * TODO: a bit better instructions scheduling can be achieved by expanding
- *       idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4, v2.h[2]
-    smlal           v28.4s, \x8, v0.h[0]
-    smlal           v28.4s, \x14, v0.h[1]
-
-    smull           v26.4s, \x16, v1.h[2]
-    smlal           v26.4s, \x12, v1.h[3]
-    smlal           v26.4s, \x10, v2.h[0]
-    smlal           v26.4s, \x6, v2.h[1]
-
-    smull           v30.4s, \x4, v2.h[2]
-    smlsl           v30.4s, \x8, v0.h[0]
-    smlsl           v30.4s, \x14, v0.h[1]
-
-    smull           v24.4s, \x16, v0.h[2]
-    smlal           v24.4s, \x12, v0.h[3]
-    smlal           v24.4s, \x10, v1.h[0]
-    smlal           v24.4s, \x6, v1.h[1]
-
-    add             v20.4s, v28.4s, v26.4s
-    sub             v28.4s, v28.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y29, v28.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y29, v28.4s, #\shift
-  .endif
-
-    add             v20.4s, v30.4s, v24.4s
-    sub             v30.4s, v30.4s, v24.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27, v20.4s
-    xtn             \y28, v30.4s
-  .else
-    rshrn           \y27, v20.4s, #\shift
-    rshrn           \y28, v30.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Save all used Neon registers */
-    sub             sp, sp, 64
-    mov             x9, sp
-    /* Load constants (v3.4h is just used for padding) */
-    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | v8.4h   | v9.4h
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | v14.4h  | v15.4h
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
-    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
-    mul             v8.4h, v8.4h, v22.4h
-    mul             v9.4h, v9.4h, v23.4h
-    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v14.4h, v14.4h, v28.4h
-    mul             v15.4h, v15.4h, v29.4h
-    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
-
-    /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
-                    v4.4h, v6.4h, v8.4h, v10.4h
-    transpose_4x4   v4, v6, v8, v10, v3
-    ins             v10.d[1], v11.d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
-                    v5.4h, v7.4h, v9.4h, v11.4h
-    transpose_4x4   v5, v7, v9, v11, v3
-    ins             v10.d[1], v11.d[0]
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
-                    v26.4h, v27.4h, v28.4h, v29.4h
-    transpose_4x4   v26, v27, v28, v29, v3
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    ins             v28.d[1], v29.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    add             v28.8h, v28.8h, v30.8h
-    sqxtun          v26.8b, v26.8h
-    sqxtun          v27.8b, v28.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    ldp             TMP3, TMP4, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    st1             {v26.s}[0], [TMP1], 4
-    st1             {v27.s}[0], [TMP3], 4
-    st1             {v26.s}[1], [TMP2], 4
-    st1             {v27.s}[1], [TMP4], 4
-#else
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[0], [TMP3], 1
-    st1             {v26.b}[1], [TMP1], 1
-    st1             {v27.b}[1], [TMP3], 1
-    st1             {v26.b}[2], [TMP1], 1
-    st1             {v27.b}[2], [TMP3], 1
-    st1             {v26.b}[3], [TMP1], 1
-    st1             {v27.b}[3], [TMP3], 1
-
-    st1             {v26.b}[4], [TMP2], 1
-    st1             {v27.b}[4], [TMP4], 1
-    st1             {v26.b}[5], [TMP2], 1
-    st1             {v27.b}[5], [TMP4], 1
-    st1             {v26.b}[6], [TMP2], 1
-    st1             {v27.b}[6], [TMP4], 1
-    st1             {v26.b}[7], [TMP2], 1
-    st1             {v27.b}[7], [TMP4], 1
-#endif
-
-    /* vpop            {v8.4h - v15.4h}    (not available) */
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll           v15.4s, \x4, #15
-    smull           v26.4s, \x6, v14.h[3]
-    smlal           v26.4s, \x10, v14.h[2]
-    smlal           v26.4s, \x12, v14.h[1]
-    smlal           v26.4s, \x16, v14.h[0]
-
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v15.4s, v15.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y27, v15.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y27, v15.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* vpush           {v8.4h - v15.4h}    (not available) */
-    sub             sp, sp, 64
-    mov             x9, sp
-
-    /* Load constants */
-    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v14.4h}, [TMP2]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | -       | -
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | -       | -
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* Dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]
-
-    /* Pass 1 */
-#if 0
-    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
-    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
-#else
-    smull           v26.4s, v6.4h, v14.h[3]
-    smlal           v26.4s, v10.4h, v14.h[2]
-    smlal           v26.4s, v12.4h, v14.h[1]
-    smlal           v26.4s, v16.4h, v14.h[0]
-    smull           v24.4s, v7.4h, v14.h[3]
-    smlal           v24.4s, v11.4h, v14.h[2]
-    smlal           v24.4s, v13.4h, v14.h[1]
-    smlal           v24.4s, v17.4h, v14.h[0]
-    sshll           v15.4s, v4.4h, #15
-    sshll           v30.4s, v5.4h, #15
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h, v20.4s, #13
-    rshrn           v6.4h, v15.4s, #13
-    add             v20.4s, v30.4s, v24.4s
-    sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h, v20.4s, #13
-    rshrn           v7.4h, v15.4s, #13
-    ins             v4.d[1], v5.d[0]
-    ins             v6.d[1], v7.d[0]
-    transpose       v4, v6, v3, .16b, .8h
-    transpose       v6, v10, v3, .16b, .4s
-    ins             v11.d[0], v10.d[1]
-    ins             v7.d[0], v6.d[1]
-#endif
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    sqxtun          v30.8b, v26.8h
-    ins             v26.d[0], v30.d[0]
-    sqxtun          v27.8b, v26.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[4], [TMP1], 1
-    st1             {v26.b}[1], [TMP2], 1
-    st1             {v27.b}[5], [TMP2], 1
-
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
-#ifndef NEON_INTRINSICS

 /*****************************************************************************/

@@ -2736,5 +2250,3 @@ generate_jsimd_huff_encode_one_block 0
 .purgem put_bits
 .purgem checkbuf31
 .purgem checkbuf47
-
-#endif  /* NEON_INTRINSICS */
--- a/simd/arm/jidctred-neon.c
+++ b/simd/arm/jidctred-neon.c
@@ -0,0 +1,484 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory. */
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,
+  F_0_899,  F_2_562,        0,        0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load constants for IDCT computation. */
+#if defined(__clang__) || defined(_MSC_VER)
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* All AC coefficients are non-zero; full IDCT calculation required. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT. */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}