Neon: Intrinsics impl. of 2x2 and 4x4 scaled IDCTs

The previous AArch32 and AArch64 GAS implementations have been removed,
since the intrinsics implementations provide the same or better
performance.
This commit is contained in:
Martyn Jacques
2018-09-18 18:28:31 +00:00
committed by DRC
parent 4574f01f43
commit 141f26ff6d
4 changed files with 489 additions and 876 deletions

View File

@@ -267,7 +267,7 @@ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
arm/jquanti-neon.c)
arm/jidctred-neon.c arm/jquanti-neon.c)
if(NEON_INTRINSICS)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
endif()
@@ -281,9 +281,11 @@ endif()
if(BITS EQUAL 32)
set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)
endif()
if(NOT NEON_INTRINSICS)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
endif()
add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S
arm/aarch${BITS}/jsimd.c)
add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)

View File

@@ -38,9 +38,6 @@
.syntax unified
#define RESPECT_STRICT_ALIGNMENT 1
/*****************************************************************************/
/* Supplementary macro for setting function attributes */
@@ -59,16 +56,6 @@ _\fname:
#endif
.endm
/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
vtrn.16 \x0, \x1
vtrn.16 \x2, \x3
vtrn.32 \x0, \x2
vtrn.32 \x1, \x3
.endm
#ifndef NEON_INTRINSICS
#define CENTERJSAMPLE 128
@@ -902,376 +889,6 @@ asm_function jsimd_idct_ifast_neon
.unreq TMP3
.unreq TMP4
#endif /* NEON_INTRINSICS */
/*****************************************************************************/
/*
* jsimd_idct_4x4_neon
*
* This function contains inverse-DCT code for getting reduced-size
* 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
* and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
* function from jpeg-6b (jidctred.c).
*
* NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
* requires much less arithmetic operations and hence should be faster.
* The primary purpose of this particular Neon optimized function is
* bit exact compatibility with jpeg-6b.
*
* TODO: a bit better instructions scheduling can be achieved by expanding
* idct_helper/transpose_4x4 macros and reordering instructions,
* but readability will suffer somewhat.
*/
#define CONST_BITS 13
#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
.balign 16
jsimd_idct_4x4_neon_consts:
.short FIX_1_847759065 /* d0[0] */
.short -FIX_0_765366865 /* d0[1] */
.short -FIX_0_211164243 /* d0[2] */
.short FIX_1_451774981 /* d0[3] */
.short -FIX_2_172734803 /* d1[0] */
.short FIX_1_061594337 /* d1[1] */
.short -FIX_0_509795579 /* d1[2] */
.short -FIX_0_601344887 /* d1[3] */
.short FIX_0_899976223 /* d2[0] */
.short FIX_2_562915447 /* d2[1] */
.short 1 << (CONST_BITS + 1) /* d2[2] */
.short 0 /* d2[3] */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
vmull.s16 q14, \x4, d2[2]
vmlal.s16 q14, \x8, d0[0]
vmlal.s16 q14, \x14, d0[1]
vmull.s16 q13, \x16, d1[2]
vmlal.s16 q13, \x12, d1[3]
vmlal.s16 q13, \x10, d2[0]
vmlal.s16 q13, \x6, d2[1]
vmull.s16 q15, \x4, d2[2]
vmlsl.s16 q15, \x8, d0[0]
vmlsl.s16 q15, \x14, d0[1]
vmull.s16 q12, \x16, d0[2]
vmlal.s16 q12, \x12, d0[3]
vmlal.s16 q12, \x10, d1[0]
vmlal.s16 q12, \x6, d1[1]
vadd.s32 q10, q14, q13
vsub.s32 q14, q14, q13
.if \shift > 16
vrshr.s32 q10, q10, #\shift
vrshr.s32 q14, q14, #\shift
vmovn.s32 \y26, q10
vmovn.s32 \y29, q14
.else
vrshrn.s32 \y26, q10, #\shift
vrshrn.s32 \y29, q14, #\shift
.endif
vadd.s32 q10, q15, q12
vsub.s32 q15, q15, q12
.if \shift > 16
vrshr.s32 q10, q10, #\shift
vrshr.s32 q15, q15, #\shift
vmovn.s32 \y27, q10
vmovn.s32 \y28, q15
.else
vrshrn.s32 \y27, q10, #\shift
vrshrn.s32 \y28, q15, #\shift
.endif
.endm
asm_function jsimd_idct_4x4_neon
DCT_TABLE .req r0
COEF_BLOCK .req r1
OUTPUT_BUF .req r2
OUTPUT_COL .req r3
TMP1 .req r0
TMP2 .req r1
TMP3 .req r2
TMP4 .req ip
vpush {d8 - d15}
/* Load constants (d3 is just used for padding) */
adr TMP4, jsimd_idct_4x4_neon_consts
vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
/* Load all COEF_BLOCK into Neon registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
* ---------+--------
* 0 | d4 | d5
* 1 | d6 | d7
* 2 | d8 | d9
* 3 | d10 | d11
* 4 | - | -
* 5 | d12 | d13
* 6 | d14 | d15
* 7 | d16 | d17
*/
vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
add COEF_BLOCK, COEF_BLOCK, #16
vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
/* dequantize */
vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
vmul.s16 q2, q2, q9
vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
vmul.s16 q3, q3, q10
vmul.s16 q4, q4, q11
add DCT_TABLE, DCT_TABLE, #16
vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
vmul.s16 q5, q5, q12
vmul.s16 q6, q6, q13
vld1.16 {d30, d31}, [DCT_TABLE, :128]!
vmul.s16 q7, q7, q14
vmul.s16 q8, q8, q15
/* Pass 1 */
idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
transpose_4x4 d4, d6, d8, d10
idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
transpose_4x4 d5, d7, d9, d11
/* Pass 2 */
idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
transpose_4x4 d26, d27, d28, d29
/* Range limit */
vmov.u16 q15, #0x80
vadd.s16 q13, q13, q15
vadd.s16 q14, q14, q15
vqmovun.s16 d26, q13
vqmovun.s16 d27, q14
/* Store results to the output buffer */
ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
/* We can use much less instructions on little endian systems if the
* OS kernel is not configured to trap unaligned memory accesses
*/
vst1.32 {d26[0]}, [TMP1]!
vst1.32 {d27[0]}, [TMP3]!
vst1.32 {d26[1]}, [TMP2]!
vst1.32 {d27[1]}, [TMP4]!
#else
vst1.8 {d26[0]}, [TMP1]!
vst1.8 {d27[0]}, [TMP3]!
vst1.8 {d26[1]}, [TMP1]!
vst1.8 {d27[1]}, [TMP3]!
vst1.8 {d26[2]}, [TMP1]!
vst1.8 {d27[2]}, [TMP3]!
vst1.8 {d26[3]}, [TMP1]!
vst1.8 {d27[3]}, [TMP3]!
vst1.8 {d26[4]}, [TMP2]!
vst1.8 {d27[4]}, [TMP4]!
vst1.8 {d26[5]}, [TMP2]!
vst1.8 {d27[5]}, [TMP4]!
vst1.8 {d26[6]}, [TMP2]!
vst1.8 {d27[6]}, [TMP4]!
vst1.8 {d26[7]}, [TMP2]!
vst1.8 {d27[7]}, [TMP4]!
#endif
vpop {d8 - d15}
bx lr
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4
.purgem idct_helper
/*****************************************************************************/
/*
* jsimd_idct_2x2_neon
*
* This function contains inverse-DCT code for getting reduced-size
* 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
* and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
* function from jpeg-6b (jidctred.c).
*
* NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
* requires much less arithmetic operations and hence should be faster.
* The primary purpose of this particular Neon optimized function is
* bit exact compatibility with jpeg-6b.
*/
.balign 8
jsimd_idct_2x2_neon_consts:
.short -FIX_0_720959822 /* d0[0] */
.short FIX_0_850430095 /* d0[1] */
.short -FIX_1_272758580 /* d0[2] */
.short FIX_3_624509785 /* d0[3] */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
vshll.s16 q14, \x4, #15
vmull.s16 q13, \x6, d0[3]
vmlal.s16 q13, \x10, d0[2]
vmlal.s16 q13, \x12, d0[1]
vmlal.s16 q13, \x16, d0[0]
vadd.s32 q10, q14, q13
vsub.s32 q14, q14, q13
.if \shift > 16
vrshr.s32 q10, q10, #\shift
vrshr.s32 q14, q14, #\shift
vmovn.s32 \y26, q10
vmovn.s32 \y27, q14
.else
vrshrn.s32 \y26, q10, #\shift
vrshrn.s32 \y27, q14, #\shift
.endif
.endm
asm_function jsimd_idct_2x2_neon
DCT_TABLE .req r0
COEF_BLOCK .req r1
OUTPUT_BUF .req r2
OUTPUT_COL .req r3
TMP1 .req r0
TMP2 .req ip
vpush {d8 - d15}
/* Load constants */
adr TMP2, jsimd_idct_2x2_neon_consts
vld1.16 {d0}, [TMP2, :64]
/* Load all COEF_BLOCK into Neon registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
* ---------+--------
* 0 | d4 | d5
* 1 | d6 | d7
* 2 | - | -
* 3 | d10 | d11
* 4 | - | -
* 5 | d12 | d13
* 6 | - | -
* 7 | d16 | d17
*/
vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
add COEF_BLOCK, COEF_BLOCK, #16
vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
add COEF_BLOCK, COEF_BLOCK, #16
vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
add COEF_BLOCK, COEF_BLOCK, #16
vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
/* Dequantize */
vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
vmul.s16 q2, q2, q9
vmul.s16 q3, q3, q10
add DCT_TABLE, DCT_TABLE, #16
vld1.16 {d24, d25}, [DCT_TABLE, :128]!
vmul.s16 q5, q5, q12
add DCT_TABLE, DCT_TABLE, #16
vld1.16 {d26, d27}, [DCT_TABLE, :128]!
vmul.s16 q6, q6, q13
add DCT_TABLE, DCT_TABLE, #16
vld1.16 {d30, d31}, [DCT_TABLE, :128]!
vmul.s16 q8, q8, q15
/* Pass 1 */
#if 0
idct_helper d4, d6, d10, d12, d16, 13, d4, d6
transpose_4x4 d4, d6, d8, d10
idct_helper d5, d7, d11, d13, d17, 13, d5, d7
transpose_4x4 d5, d7, d9, d11
#else
vmull.s16 q13, d6, d0[3]
vmlal.s16 q13, d10, d0[2]
vmlal.s16 q13, d12, d0[1]
vmlal.s16 q13, d16, d0[0]
vmull.s16 q12, d7, d0[3]
vmlal.s16 q12, d11, d0[2]
vmlal.s16 q12, d13, d0[1]
vmlal.s16 q12, d17, d0[0]
vshll.s16 q14, d4, #15
vshll.s16 q15, d5, #15
vadd.s32 q10, q14, q13
vsub.s32 q14, q14, q13
vrshrn.s32 d4, q10, #13
vrshrn.s32 d6, q14, #13
vadd.s32 q10, q15, q12
vsub.s32 q14, q15, q12
vrshrn.s32 d5, q10, #13
vrshrn.s32 d7, q14, #13
vtrn.16 q2, q3
vtrn.32 q3, q5
#endif
/* Pass 2 */
idct_helper d4, d6, d10, d7, d11, 20, d26, d27
/* Range limit */
vmov.u16 q15, #0x80
vadd.s16 q13, q13, q15
vqmovun.s16 d26, q13
vqmovun.s16 d27, q13
/* Store results to the output buffer */
ldmia OUTPUT_BUF, {TMP1, TMP2}
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
vst1.8 {d26[0]}, [TMP1]!
vst1.8 {d27[4]}, [TMP1]!
vst1.8 {d26[1]}, [TMP2]!
vst1.8 {d27[5]}, [TMP2]!
vpop {d8 - d15}
bx lr
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.purgem idct_helper
#ifndef NEON_INTRINSICS
/*****************************************************************************/
@@ -1581,5 +1198,3 @@ generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
.purgem do_load
.purgem do_store
#endif /* NEON_INTRINSICS */

View File

@@ -39,8 +39,6 @@
.section .rodata, "a", %progbits
#endif
#ifndef NEON_INTRINSICS
/* Constants for jsimd_idct_islow_neon() */
#define F_0_298 2446 /* FIX(0.298631336) */
@@ -88,51 +86,6 @@ Ljsimd_idct_islow_neon_consts:
#undef F_2_562
#undef F_3_072
#endif
/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
#define CONST_BITS 13
#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
.balign 16
Ljsimd_idct_4x4_neon_consts:
.short FIX_1_847759065 /* v0.h[0] */
.short -FIX_0_765366865 /* v0.h[1] */
.short -FIX_0_211164243 /* v0.h[2] */
.short FIX_1_451774981 /* v0.h[3] */
.short -FIX_2_172734803 /* d1[0] */
.short FIX_1_061594337 /* d1[1] */
.short -FIX_0_509795579 /* d1[2] */
.short -FIX_0_601344887 /* d1[3] */
.short FIX_0_899976223 /* v2.h[0] */
.short FIX_2_562915447 /* v2.h[1] */
.short 1 << (CONST_BITS + 1) /* v2.h[2] */
.short 0 /* v2.h[3] */
.balign 8
Ljsimd_idct_2x2_neon_consts:
.short -FIX_0_720959822 /* v14[0] */
.short FIX_0_850430095 /* v14[1] */
.short -FIX_1_272758580 /* v14[2] */
.short FIX_3_624509785 /* v14[3] */
#ifndef NEON_INTRINSICS
/* Constants for jsimd_ycc_*_neon() */
.balign 16
@@ -151,8 +104,6 @@ Ljsimd_rgb_ycc_neon_consts:
.short 32767, 128, 32767, 128
.short 32767, 128, 32767, 128
#endif
/* Constants for jsimd_fdct_islow_neon() */
#define F_0_298 2446 /* FIX(0.298631336) */
@@ -200,8 +151,6 @@ Ljsimd_fdct_islow_neon_consts:
#undef F_2_562
#undef F_3_072
#ifndef NEON_INTRINSICS
/* Constants for jsimd_huff_encode_one_block_neon() */
.balign 16
@@ -233,11 +182,6 @@ Ljsimd_huff_encode_one_block_neon_consts:
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
#endif
#define RESPECT_STRICT_ALIGNMENT 1
/*****************************************************************************/
@@ -268,47 +212,6 @@ _\fname:
#endif
.endm
/* Transpose elements of single 128 bit registers */
.macro transpose_single x0, x1, xi, xilen, literal
ins \xi\xilen[0], \x0\xilen[0]
ins \x1\xilen[0], \x0\xilen[1]
trn1 \x0\literal, \x0\literal, \x1\literal
trn2 \x1\literal, \xi\literal, \x1\literal
.endm
/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
mov \xi\xilen, \x0\xilen
trn1 \x0\literal, \x0\literal, \x1\literal
trn2 \x1\literal, \xi\literal, \x1\literal
.endm
/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
mov \xi\xilen, \x0\xilen
trn1 \x0\x0len, \x0\x0len, \x2\x2len
trn2 \x2\x2len, \xi\x0len, \x2\x2len
mov \xi\xilen, \x1\xilen
trn1 \x1\x1len, \x1\x1len, \x3\x3len
trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm
.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
mov \xi\xilen, \x0\xilen
trn1 \x0\x0len, \x0\x0len, \x1\x1len
trn2 \x1\x2len, \xi\x0len, \x1\x2len
mov \xi\xilen, \x2\xilen
trn1 \x2\x2len, \x2\x2len, \x3\x3len
trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm
.macro transpose_4x4 x0, x1, x2, x3, x5
transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
#ifndef NEON_INTRINSICS
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
trn1 \t0\().8h, \l0\().8h, \l1\().8h
trn1 \t1\().8h, \l2\().8h, \l3\().8h
@@ -937,395 +840,6 @@ asm_function jsimd_idct_islow_neon
#undef XFIX_N_2_562
#undef XFIX_P_3_072
#endif /* NEON_INTRINSICS */
/*****************************************************************************/
/*
* jsimd_idct_4x4_neon
*
* This function contains inverse-DCT code for getting reduced-size
* 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
* and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
* function from jpeg-6b (jidctred.c).
*
* NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
* requires much less arithmetic operations and hence should be faster.
* The primary purpose of this particular Neon optimized function is
* bit exact compatibility with jpeg-6b.
*
* TODO: a bit better instructions scheduling can be achieved by expanding
* idct_helper/transpose_4x4 macros and reordering instructions,
* but readability will suffer somewhat.
*/
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
smull v28.4s, \x4, v2.h[2]
smlal v28.4s, \x8, v0.h[0]
smlal v28.4s, \x14, v0.h[1]
smull v26.4s, \x16, v1.h[2]
smlal v26.4s, \x12, v1.h[3]
smlal v26.4s, \x10, v2.h[0]
smlal v26.4s, \x6, v2.h[1]
smull v30.4s, \x4, v2.h[2]
smlsl v30.4s, \x8, v0.h[0]
smlsl v30.4s, \x14, v0.h[1]
smull v24.4s, \x16, v0.h[2]
smlal v24.4s, \x12, v0.h[3]
smlal v24.4s, \x10, v1.h[0]
smlal v24.4s, \x6, v1.h[1]
add v20.4s, v28.4s, v26.4s
sub v28.4s, v28.4s, v26.4s
.if \shift > 16
srshr v20.4s, v20.4s, #\shift
srshr v28.4s, v28.4s, #\shift
xtn \y26, v20.4s
xtn \y29, v28.4s
.else
rshrn \y26, v20.4s, #\shift
rshrn \y29, v28.4s, #\shift
.endif
add v20.4s, v30.4s, v24.4s
sub v30.4s, v30.4s, v24.4s
.if \shift > 16
srshr v20.4s, v20.4s, #\shift
srshr v30.4s, v30.4s, #\shift
xtn \y27, v20.4s
xtn \y28, v30.4s
.else
rshrn \y27, v20.4s, #\shift
rshrn \y28, v30.4s, #\shift
.endif
.endm
asm_function jsimd_idct_4x4_neon
DCT_TABLE .req x0
COEF_BLOCK .req x1
OUTPUT_BUF .req x2
OUTPUT_COL .req x3
TMP1 .req x0
TMP2 .req x1
TMP3 .req x2
TMP4 .req x15
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
guarantee that the upper (unused) 32 bits of x3 are valid. This
instruction ensures that those bits are set to zero. */
uxtw x3, w3
/* Save all used Neon registers */
sub sp, sp, 64
mov x9, sp
/* Load constants (v3.4h is just used for padding) */
get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into Neon registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
* ---------+--------
* 0 | v4.4h | v5.4h
* 1 | v6.4h | v7.4h
* 2 | v8.4h | v9.4h
* 3 | v10.4h | v11.4h
* 4 | - | -
* 5 | v12.4h | v13.4h
* 6 | v14.4h | v15.4h
* 7 | v16.4h | v17.4h
*/
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
add COEF_BLOCK, COEF_BLOCK, #16
ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
/* dequantize */
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
mul v4.4h, v4.4h, v18.4h
mul v5.4h, v5.4h, v19.4h
ins v4.d[1], v5.d[0] /* 128 bit q4 */
ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
mul v6.4h, v6.4h, v20.4h
mul v7.4h, v7.4h, v21.4h
ins v6.d[1], v7.d[0] /* 128 bit q6 */
mul v8.4h, v8.4h, v22.4h
mul v9.4h, v9.4h, v23.4h
ins v8.d[1], v9.d[0] /* 128 bit q8 */
add DCT_TABLE, DCT_TABLE, #16
ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
mul v10.4h, v10.4h, v24.4h
mul v11.4h, v11.4h, v25.4h
ins v10.d[1], v11.d[0] /* 128 bit q10 */
mul v12.4h, v12.4h, v26.4h
mul v13.4h, v13.4h, v27.4h
ins v12.d[1], v13.d[0] /* 128 bit q12 */
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
mul v14.4h, v14.4h, v28.4h
mul v15.4h, v15.4h, v29.4h
ins v14.d[1], v15.d[0] /* 128 bit q14 */
mul v16.4h, v16.4h, v30.4h
mul v17.4h, v17.4h, v31.4h
ins v16.d[1], v17.d[0] /* 128 bit q16 */
/* Pass 1 */
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
v4.4h, v6.4h, v8.4h, v10.4h
transpose_4x4 v4, v6, v8, v10, v3
ins v10.d[1], v11.d[0]
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
v5.4h, v7.4h, v9.4h, v11.4h
transpose_4x4 v5, v7, v9, v11, v3
ins v10.d[1], v11.d[0]
/* Pass 2 */
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
v26.4h, v27.4h, v28.4h, v29.4h
transpose_4x4 v26, v27, v28, v29, v3
/* Range limit */
movi v30.8h, #0x80
ins v26.d[1], v27.d[0]
ins v28.d[1], v29.d[0]
add v26.8h, v26.8h, v30.8h
add v28.8h, v28.8h, v30.8h
sqxtun v26.8b, v26.8h
sqxtun v27.8b, v28.8h
/* Store results to the output buffer */
ldp TMP1, TMP2, [OUTPUT_BUF], 16
ldp TMP3, TMP4, [OUTPUT_BUF]
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
/* We can use much less instructions on little endian systems if the
* OS kernel is not configured to trap unaligned memory accesses
*/
st1 {v26.s}[0], [TMP1], 4
st1 {v27.s}[0], [TMP3], 4
st1 {v26.s}[1], [TMP2], 4
st1 {v27.s}[1], [TMP4], 4
#else
st1 {v26.b}[0], [TMP1], 1
st1 {v27.b}[0], [TMP3], 1
st1 {v26.b}[1], [TMP1], 1
st1 {v27.b}[1], [TMP3], 1
st1 {v26.b}[2], [TMP1], 1
st1 {v27.b}[2], [TMP3], 1
st1 {v26.b}[3], [TMP1], 1
st1 {v27.b}[3], [TMP3], 1
st1 {v26.b}[4], [TMP2], 1
st1 {v27.b}[4], [TMP4], 1
st1 {v26.b}[5], [TMP2], 1
st1 {v27.b}[5], [TMP4], 1
st1 {v26.b}[6], [TMP2], 1
st1 {v27.b}[6], [TMP4], 1
st1 {v26.b}[7], [TMP2], 1
st1 {v27.b}[7], [TMP4], 1
#endif
/* vpop {v8.4h - v15.4h} (not available) */
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4
.purgem idct_helper
/*****************************************************************************/
/*
* jsimd_idct_2x2_neon
*
* This function contains inverse-DCT code for getting reduced-size
* 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
* and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
* function from jpeg-6b (jidctred.c).
*
* NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
* requires much less arithmetic operations and hence should be faster.
* The primary purpose of this particular Neon optimized function is
* bit exact compatibility with jpeg-6b.
*/
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
sshll v15.4s, \x4, #15
smull v26.4s, \x6, v14.h[3]
smlal v26.4s, \x10, v14.h[2]
smlal v26.4s, \x12, v14.h[1]
smlal v26.4s, \x16, v14.h[0]
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
.if \shift > 16
srshr v20.4s, v20.4s, #\shift
srshr v15.4s, v15.4s, #\shift
xtn \y26, v20.4s
xtn \y27, v15.4s
.else
rshrn \y26, v20.4s, #\shift
rshrn \y27, v15.4s, #\shift
.endif
.endm
asm_function jsimd_idct_2x2_neon
DCT_TABLE .req x0
COEF_BLOCK .req x1
OUTPUT_BUF .req x2
OUTPUT_COL .req x3
TMP1 .req x0
TMP2 .req x15
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
guarantee that the upper (unused) 32 bits of x3 are valid. This
instruction ensures that those bits are set to zero. */
uxtw x3, w3
/* vpush {v8.4h - v15.4h} (not available) */
sub sp, sp, 64
mov x9, sp
/* Load constants */
get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into Neon registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
* ---------+--------
* 0 | v4.4h | v5.4h
* 1 | v6.4h | v7.4h
* 2 | - | -
* 3 | v10.4h | v11.4h
* 4 | - | -
* 5 | v12.4h | v13.4h
* 6 | - | -
* 7 | v16.4h | v17.4h
*/
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
add COEF_BLOCK, COEF_BLOCK, #16
ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
add COEF_BLOCK, COEF_BLOCK, #16
ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
add COEF_BLOCK, COEF_BLOCK, #16
ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
/* Dequantize */
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
mul v4.4h, v4.4h, v18.4h
mul v5.4h, v5.4h, v19.4h
ins v4.d[1], v5.d[0]
mul v6.4h, v6.4h, v20.4h
mul v7.4h, v7.4h, v21.4h
ins v6.d[1], v7.d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
mul v10.4h, v10.4h, v24.4h
mul v11.4h, v11.4h, v25.4h
ins v10.d[1], v11.d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
mul v12.4h, v12.4h, v26.4h
mul v13.4h, v13.4h, v27.4h
ins v12.d[1], v13.d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
mul v16.4h, v16.4h, v30.4h
mul v17.4h, v17.4h, v31.4h
ins v16.d[1], v17.d[0]
/* Pass 1 */
#if 0
idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
smull v26.4s, v6.4h, v14.h[3]
smlal v26.4s, v10.4h, v14.h[2]
smlal v26.4s, v12.4h, v14.h[1]
smlal v26.4s, v16.4h, v14.h[0]
smull v24.4s, v7.4h, v14.h[3]
smlal v24.4s, v11.4h, v14.h[2]
smlal v24.4s, v13.4h, v14.h[1]
smlal v24.4s, v17.4h, v14.h[0]
sshll v15.4s, v4.4h, #15
sshll v30.4s, v5.4h, #15
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
rshrn v4.4h, v20.4s, #13
rshrn v6.4h, v15.4s, #13
add v20.4s, v30.4s, v24.4s
sub v15.4s, v30.4s, v24.4s
rshrn v5.4h, v20.4s, #13
rshrn v7.4h, v15.4s, #13
ins v4.d[1], v5.d[0]
ins v6.d[1], v7.d[0]
transpose v4, v6, v3, .16b, .8h
transpose v6, v10, v3, .16b, .4s
ins v11.d[0], v10.d[1]
ins v7.d[0], v6.d[1]
#endif
/* Pass 2 */
idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
/* Range limit */
movi v30.8h, #0x80
ins v26.d[1], v27.d[0]
add v26.8h, v26.8h, v30.8h
sqxtun v30.8b, v26.8h
ins v26.d[0], v30.d[0]
sqxtun v27.8b, v26.8h
/* Store results to the output buffer */
ldp TMP1, TMP2, [OUTPUT_BUF]
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
st1 {v26.b}[0], [TMP1], 1
st1 {v27.b}[4], [TMP1], 1
st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.purgem idct_helper
#ifndef NEON_INTRINSICS
/*****************************************************************************/
@@ -2736,5 +2250,3 @@ generate_jsimd_huff_encode_one_block 0
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47
#endif /* NEON_INTRINSICS */

484
simd/arm/jidctred-neon.c Normal file
View File

@@ -0,0 +1,484 @@
/*
* jidctred-neon.c - reduced-size IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
#define CONST_BITS 13
#define PASS1_BITS 2
#define F_0_211 1730
#define F_0_509 4176
#define F_0_601 4926
#define F_0_720 5906
#define F_0_765 6270
#define F_0_850 6967
#define F_0_899 7373
#define F_1_061 8697
#define F_1_272 10426
#define F_1_451 11893
#define F_1_847 15137
#define F_2_172 17799
#define F_2_562 20995
#define F_3_624 29692
/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
* 2x2 output from an 8x8 DCT block. It uses the same calculations and
* produces exactly the same output as IJG's original jpeg_idct_2x2() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.720959822 = 5906 * 2^-13
* 0.850430095 = 6967 * 2^-13
* 1.272758580 = 10426 * 2^-13
* 3.624509785 = 29692 * 2^-13
*
* See jidctred.c for further details of the 2x2 IDCT algorithm. Where
* possible, the variable names and comments here in jsimd_idct_2x2_neon()
* match up with those in jpeg_idct_2x2().
*/
ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
-F_0_720, F_0_850, -F_1_272, F_3_624
};
void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Dequantize DCT coefficients. */
row0 = vmulq_s16(row0, quant_row0);
row1 = vmulq_s16(row1, quant_row1);
row3 = vmulq_s16(row3, quant_row3);
row5 = vmulq_s16(row5, quant_row5);
row7 = vmulq_s16(row7, quant_row7);
/* Load IDCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
/* Pass 1: process columns from input, put results in vectors row0 and
* row1.
*/
/* Even part */
int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
/* Odd part */
int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
/* Transpose two rows, ready for second pass. */
int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
int16x8_t cols_0246 = cols_0246_1357.val[0];
int16x8_t cols_1357 = cols_0246_1357.val[1];
/* Duplicate columns such that each is accessible in its own vector. */
int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
vreinterpretq_s32_s16(cols_1357));
int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
/* Pass 2: process two rows, store to output array. */
/* Even part: we're only interested in col0; the top half of tmp10 is "don't
* care."
*/
int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
/* Odd part: we're only interested in the bottom half of tmp0. */
int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
vsubhn_s32(tmp10, tmp0));
output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
CONST_BITS + PASS1_BITS + 3 + 2 - 16);
/* Narrow to 8-bit and convert to unsigned. */
uint8x8_t output_u8 = vqmovun_s16(output_s16);
/* Store 2x2 block to memory. */
vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
}
/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
* 4x4 output from an 8x8 DCT block. It uses the same calculations and
* produces exactly the same output as IJG's original jpeg_idct_4x4() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.211164243 = 1730 * 2^-13
* 0.509795579 = 4176 * 2^-13
* 0.601344887 = 4926 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.061594337 = 8697 * 2^-13
* 1.451774981 = 11893 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 2.172734803 = 17799 * 2^-13
* 2.562915447 = 20995 * 2^-13
*
* See jidctred.c for further details of the 4x4 IDCT algorithm. Where
* possible, the variable names and comments here in jsimd_idct_4x4_neon()
* match up with those in jpeg_idct_4x4().
*/
ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
F_1_847, -F_0_765, -F_0_211, F_1_451,
-F_2_172, F_1_061, -F_0_509, -F_0_601,
F_0_899, F_2_562, 0, 0
};
void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
/* Construct bitmap to test if all AC coefficients are 0. */
int16x8_t bitmap = vorrq_s16(row1, row2);
bitmap = vorrq_s16(bitmap, row3);
bitmap = vorrq_s16(bitmap, row5);
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load constants for IDCT computation. */
#if defined(__clang__) || defined(_MSC_VER)
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
/* All AC coefficients are zero.
* Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
*/
int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
row0 = dcval;
row1 = dcval;
row2 = dcval;
row3 = dcval;
} else if (left_ac_bitmap == 0) {
/* AC coefficients are zero for columns 0, 1, 2, and 3.
* Compute DC values for these columns.
*/
int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
/* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
z2 = vmul_s16(vget_high_s16(row5), quant_row5);
z3 = vmul_s16(vget_high_s16(row3), quant_row3);
int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1));
row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1));
row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
} else if (right_ac_bitmap == 0) {
/* AC coefficients are zero for columns 4, 5, 6, and 7.
* Compute DC values for these columns.
*/
int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
/* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
z2 = vmul_s16(vget_low_s16(row5), quant_row5);
z3 = vmul_s16(vget_low_s16(row3), quant_row3);
int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1), dcval);
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1), dcval);
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1), dcval);
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1), dcval);
} else {
/* All AC coefficients are non-zero; full IDCT calculation required. */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Even part */
int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x8_t z2 = vmulq_s16(row2, quant_row2);
int16x8_t z3 = vmulq_s16(row6, quant_row6);
int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
/* Odd part */
int16x8_t z1 = vmulq_s16(row7, quant_row7);
z2 = vmulq_s16(row5, quant_row5);
z3 = vmulq_s16(row3, quant_row3);
int16x8_t z4 = vmulq_s16(row1, quant_row1);
tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
CONST_BITS - PASS1_BITS + 1));
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
CONST_BITS - PASS1_BITS + 1));
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
CONST_BITS - PASS1_BITS + 1));
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
CONST_BITS - PASS1_BITS + 1));
}
/* Transpose 8x4 block to perform IDCT on rows in second pass. */
int16x8x2_t row_01 = vtrnq_s16(row0, row1);
int16x8x2_t row_23 = vtrnq_s16(row2, row3);
int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
vreinterpretq_s32_s16(row_23.val[0]));
int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
vreinterpretq_s32_s16(row_23.val[1]));
int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
/* Commence second pass of IDCT. */
/* Even part */
int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
vsubhn_s32(tmp12, tmp0));
int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
vsubhn_s32(tmp10, tmp2));
output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
/* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
* An interleaving store completes the transpose.
*/
uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
vqmovun_s16(output_cols_13));
uint16x4x2_t output_01_23 = { {
vreinterpret_u16_u8(output_0123.val[0]),
vreinterpret_u16_u8(output_0123.val[1])
} };
/* Store 4x4 block to memory. */
JSAMPROW outptr0 = output_buf[0] + output_col;
JSAMPROW outptr1 = output_buf[1] + output_col;
JSAMPROW outptr2 = output_buf[2] + output_col;
JSAMPROW outptr3 = output_buf[3] + output_col;
vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
}