Make ARM ISLOW iDCT faster on typical cases, and eliminate the possibility of 16-bit overflows when handling arbitrary coefficients.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@692 632fc199-4ca6-4c93-a231-07263d6284db
Author: DRC
Date:   2011-09-06 18:55:45 +00:00
Parent: 98a44fe07b
Commit: 5129e3960f
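
As a rough orientation for the assembly diff below, here is a minimal C sketch (not part of the commit; the helper name right_half_rows_1_to_7_are_zero is purely illustrative) of the sparseness test that the new ldrd/orr sequence performs on COEF_BLOCK: if rows 1-7 of the right 4x8 half of the coefficient block are all zero, which is the common case for typical quantized JPEG data, the second IDCT pass can branch to a much cheaper variant.

#include <stdint.h>

/* Illustrative sketch only (not the commit's code).  The 8x8 block of
 * 16-bit DCT coefficients is viewed as a left 4x8 half (columns 0-3)
 * and a right 4x8 half (columns 4-7).  ORing together the right-half
 * coefficients of rows 1-7 tells us whether the right half is sparse
 * (at most row 0 is non-zero), mirroring the ldrd/orr/orrs ... beq 3f
 * test interleaved with the left-half arithmetic below. */
static int right_half_rows_1_to_7_are_zero(const int16_t coef_block[64])
{
  uint32_t acc = 0;
  for (int row = 1; row < 8; row++)
    for (int col = 4; col < 8; col++)
      acc |= (uint16_t)coef_block[row * 8 + col];
  return acc == 0;
}

The epilogue change appears to be what addresses the 16-bit overflow mentioned in the commit message: instead of adding CENTERJSAMPLE << 5 to 16-bit data and narrowing with vqrshrun, the new code narrows with vqrshrn and adds CENTERJSAMPLE to the already 8-bit samples with vadd.u8.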

@@ -263,46 +263,74 @@ asm_function jsimd_idct_islow_neon
     vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
     vmlal.s16 q6, d5, XFIX_1_175875602
     vmull.s16 q7, d4, XFIX_1_175875602
+    /* Check for the zero coefficients in the right 4x8 half */
+    push {r4, r5}
     vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
     vsubl.s16 q3, ROW0L, ROW4L
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
     vmull.s16 q2, ROW2L, XFIX_0_541196100
     vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+    orr r0, r4, r5
     vmov q4, q6
     vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
     vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
     vshl.s32 q3, q3, #13
+    orr r0, r0, r4
     vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+    orr r0, r0, r5
     vadd.s32 q1, q3, q2
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
     vmov q5, q7
     vadd.s32 q1, q1, q6
+    orr r0, r0, r4
     vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+    orr r0, r0, r5
     vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
     vrshrn.s32 ROW1L, q1, #11
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
     vsub.s32 q1, q1, q6
     vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+    orr r0, r0, r4
     vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+    orr r0, r0, r5
     vsub.s32 q1, q1, q6
     vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
     vmlal.s16 q6, ROW6L, XFIX_0_541196100
     vsub.s32 q3, q3, q2
+    orr r0, r0, r4
     vrshrn.s32 ROW6L, q1, #11
+    orr r0, r0, r5
     vadd.s32 q1, q3, q5
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
     vsub.s32 q3, q3, q5
     vaddl.s16 q5, ROW0L, ROW4L
+    orr r0, r0, r4
     vrshrn.s32 ROW2L, q1, #11
+    orr r0, r0, r5
     vrshrn.s32 ROW5L, q3, #11
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
     vshl.s32 q5, q5, #13
     vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+    orr r0, r0, r4
     vadd.s32 q2, q5, q6
+    orrs r0, r0, r5
     vsub.s32 q1, q5, q6
     vadd.s32 q6, q2, q7
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
     vsub.s32 q2, q2, q7
     vadd.s32 q5, q1, q4
+    orr r0, r4, r5
     vsub.s32 q3, q1, q4
+    pop {r4, r5}
     vrshrn.s32 ROW7L, q2, #11
     vrshrn.s32 ROW3L, q5, #11
     vrshrn.s32 ROW0L, q6, #11
     vrshrn.s32 ROW4L, q3, #11
+    beq 3f /* Go to do some special handling for the sparse right 4x8 half */
     /* 1-D IDCT, pass 1, right 4x8 half */
     vld1.s16 {d2}, [ip, :64] /* reload constants */
     vadd.s16 d10, ROW7R, ROW3R
@@ -359,102 +387,101 @@ asm_function jsimd_idct_islow_neon
     vrshrn.s32 ROW3R, q5, #11
     vrshrn.s32 ROW0R, q6, #11
     vrshrn.s32 ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16 ROW6R, ROW7R
+    vtrn.16 ROW2R, ROW3R
+    vtrn.16 ROW0R, ROW1R
+    vtrn.16 ROW4R, ROW5R
+    vtrn.32 ROW1R, ROW3R
+    vtrn.32 ROW4R, ROW6R
+    vtrn.32 ROW0R, ROW2R
+    vtrn.32 ROW5R, ROW7R
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
     vld1.s16 {d2}, [ip, :64] /* reload constants */
-    /* Transpose right 4x8 half */
-    vtrn.16 ROW6R, ROW7R
-    vtrn.16 ROW2R, ROW3R
-    vtrn.16 ROW0R, ROW1R
-    vtrn.16 ROW4R, ROW5R
-    vmov.s16 q7, #(CENTERJSAMPLE << 5)
-    vtrn.32 ROW1R, ROW3R
-    vtrn.32 ROW4R, ROW6R
-    vtrn.32 ROW0R, ROW2R
-    vtrn.32 ROW5R, ROW7R
-    /* 1-D IDCT, pass 2, left 4x8 half */
-    vswp ROW7L, ROW3R
-    vadd.s16 d10, ROW7L, ROW3L
-    vswp ROW5L, ROW1R
-    vadd.s16 d8, ROW5L, ROW1L
-    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16 q6, d8, XFIX_1_175875602
-    vswp ROW4L, ROW0R
-    vadd.s16 q8, q8, q7
-    vmull.s16 q7, d10, XFIX_1_175875602
-    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16 q3, ROW0L, ROW4L
-    vswp ROW6L, ROW2R
+    vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+    vmlal.s16 q6, ROW1L, XFIX_1_175875602
+    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+    vmlal.s16 q7, ROW3L, XFIX_1_175875602
+    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
     vmull.s16 q2, ROW2L, XFIX_0_541196100
-    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
     vmov q4, q6
-    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+    vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
     vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
     vshl.s32 q3, q3, #13
     vmlsl.s16 q4, ROW1L, XFIX_0_899976223
     vadd.s32 q1, q3, q2
     vmov q5, q7
     vadd.s32 q1, q1, q6
-    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+    vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
     vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
     vshrn.s32 ROW1L, q1, #16
     vsub.s32 q1, q1, q6
-    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
     vmlsl.s16 q5, ROW3L, XFIX_2_562915447
     vsub.s32 q1, q1, q6
     vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16 q6, ROW6L, XFIX_0_541196100
+    vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
     vsub.s32 q3, q3, q2
-    vshrn.s32 ROW6L, q1, #16
+    vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
     vadd.s32 q1, q3, q5
     vsub.s32 q3, q3, q5
-    vaddl.s16 q5, ROW0L, ROW4L
+    vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
     vshrn.s32 ROW2L, q1, #16
-    vshrn.s32 ROW5L, q3, #16
+    vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
     vshl.s32 q5, q5, #13
-    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
     vadd.s32 q2, q5, q6
     vsub.s32 q1, q5, q6
     vadd.s32 q6, q2, q7
     vsub.s32 q2, q2, q7
     vadd.s32 q5, q1, q4
     vsub.s32 q3, q1, q4
-    vshrn.s32 ROW7L, q2, #16
+    vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
     vshrn.s32 ROW3L, q5, #16
     vshrn.s32 ROW0L, q6, #16
-    vshrn.s32 ROW4L, q3, #16
+    vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2, right 4x8 half */
     vld1.s16 {d2}, [ip, :64] /* reload constants */
-    vadd.s16 d10, ROW7R, ROW3R
-    vadd.s16 d8, ROW5R, ROW1R
-    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16 q6, d8, XFIX_1_175875602
-    vmull.s16 q7, d10, XFIX_1_175875602
-    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16 q3, ROW0R, ROW4R
-    vmull.s16 q2, ROW2R, XFIX_0_541196100
+    vmull.s16 q6, ROW5R, XFIX_1_175875602
+    vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+    vmull.s16 q7, ROW7R, XFIX_1_175875602
+    vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+    vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+    vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
     vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
     vmov q4, q6
     vmlsl.s16 q6, ROW5R, XFIX_2_562915447
-    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
     vshl.s32 q3, q3, #13
-    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+    vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
     vadd.s32 q1, q3, q2
     vmov q5, q7
     vadd.s32 q1, q1, q6
     vmlsl.s16 q7, ROW7R, XFIX_0_899976223
-    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32 ROW1R, q1, #16
+    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+    vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
     vsub.s32 q1, q1, q6
     vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+    vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
     vsub.s32 q1, q1, q6
-    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
     vmlal.s16 q6, ROW6R, XFIX_0_541196100
     vsub.s32 q3, q3, q2
     vshrn.s32 ROW6R, q1, #16
     vadd.s32 q1, q3, q5
     vsub.s32 q3, q3, q5
-    vaddl.s16 q5, ROW0R, ROW4R
-    vshrn.s32 ROW2R, q1, #16
+    vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+    vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
     vshrn.s32 ROW5R, q3, #16
     vshl.s32 q5, q5, #13
     vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
@@ -465,50 +492,157 @@ asm_function jsimd_idct_islow_neon
     vadd.s32 q5, q1, q4
     vsub.s32 q3, q1, q4
     vshrn.s32 ROW7R, q2, #16
-    vshrn.s32 ROW3R, q5, #16
-    vshrn.s32 ROW0R, q6, #16
+    vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+    vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
     vshrn.s32 ROW4R, q3, #16
-    /* Descale to 8-bit and range limit */
-    vqrshrun.s16 d16, q8, #2
-    vqrshrun.s16 d17, q9, #2
-    vqrshrun.s16 d18, q10, #2
-    vqrshrun.s16 d19, q11, #2
+2: /* Descale to 8-bit and range limit */
+    vqrshrn.s16 d16, q8, #2
+    vqrshrn.s16 d17, q9, #2
+    vqrshrn.s16 d18, q10, #2
+    vqrshrn.s16 d19, q11, #2
     vpop {d8-d15} /* restore NEON registers */
-    vqrshrun.s16 d20, q12, #2
-    vqrshrun.s16 d21, q13, #2
-    vqrshrun.s16 d22, q14, #2
-    vqrshrun.s16 d23, q15, #2
-    /* Transpose the final 8-bit samples */
-    vtrn.16 q8, q9
-    vtrn.16 q10, q11
-    vtrn.32 q8, q10
-    vtrn.32 q9, q11
-    vtrn.8 d16, d17
-    vtrn.8 d18, d19
-    /* Store results to the output buffer */
-    ldmia OUTPUT_BUF!, {TMP1, TMP2}
-    add TMP1, TMP1, OUTPUT_COL
-    add TMP2, TMP2, OUTPUT_COL
-    vst1.8 {d16}, [TMP1]
-    vst1.8 {d17}, [TMP2]
-    ldmia OUTPUT_BUF!, {TMP1, TMP2}
-    add TMP1, TMP1, OUTPUT_COL
-    add TMP2, TMP2, OUTPUT_COL
-    vst1.8 {d18}, [TMP1]
-    vtrn.8 d20, d21
-    vst1.8 {d19}, [TMP2]
-    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-    add TMP1, TMP1, OUTPUT_COL
-    add TMP2, TMP2, OUTPUT_COL
-    add TMP3, TMP3, OUTPUT_COL
-    add TMP4, TMP4, OUTPUT_COL
-    vst1.8 {d20}, [TMP1]
-    vtrn.8 d22, d23
-    vst1.8 {d21}, [TMP2]
-    vst1.8 {d22}, [TMP3]
-    vst1.8 {d23}, [TMP4]
+    vqrshrn.s16 d20, q12, #2
+    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+    vtrn.16 q8, q9
+    vqrshrn.s16 d21, q13, #2
+    vqrshrn.s16 d22, q14, #2
+    vmov.u8 q0, #(CENTERJSAMPLE)
+    vqrshrn.s16 d23, q15, #2
+    vtrn.8 d16, d17
+    vtrn.8 d18, d19
+    vadd.u8 q8, q8, q0
+    vadd.u8 q9, q9, q0
+    vtrn.16 q10, q11
+    /* Store results to the output buffer */
+    ldmia OUTPUT_BUF!, {TMP1, TMP2}
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    vst1.8 {d16}, [TMP1]
+    vtrn.8 d20, d21
+    vst1.8 {d17}, [TMP2]
+    ldmia OUTPUT_BUF!, {TMP1, TMP2}
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    vst1.8 {d18}, [TMP1]
+    vadd.u8 q10, q10, q0
+    vst1.8 {d19}, [TMP2]
+    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    add TMP3, TMP3, OUTPUT_COL
+    add TMP4, TMP4, OUTPUT_COL
+    vtrn.8 d22, d23
+    vst1.8 {d20}, [TMP1]
+    vadd.u8 q11, q11, q0
+    vst1.8 {d21}, [TMP2]
+    vst1.8 {d22}, [TMP3]
+    vst1.8 {d23}, [TMP4]
     bx lr
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+    /* Transpose left 4x8 half */
+    vtrn.16 ROW6L, ROW7L
+    vtrn.16 ROW2L, ROW3L
+    vtrn.16 ROW0L, ROW1L
+    vtrn.16 ROW4L, ROW5L
+    vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+    vtrn.32 ROW1L, ROW3L
+    vtrn.32 ROW4L, ROW6L
+    vtrn.32 ROW0L, ROW2L
+    vtrn.32 ROW5L, ROW7L
+    cmp r0, #0
+    beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    /* Only row 0 is non-zero for the right 4x8 half */
+    vdup.s16 ROW1R, ROW0R[1]
+    vdup.s16 ROW2R, ROW0R[2]
+    vdup.s16 ROW3R, ROW0R[3]
+    vdup.s16 ROW4R, ROW0R[0]
+    vdup.s16 ROW5R, ROW0R[1]
+    vdup.s16 ROW6R, ROW0R[2]
+    vdup.s16 ROW7R, ROW0R[3]
+    vdup.s16 ROW0R, ROW0R[0]
+    b 1b /* Go to 'normal' second pass */
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16 {d2}, [ip, :64] /* reload constants */
+    vmull.s16 q6, ROW1L, XFIX_1_175875602
+    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16 q7, ROW3L, XFIX_1_175875602
+    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16 q2, ROW2L, XFIX_0_541196100
+    vshll.s16 q3, ROW0L, #13
+    vmov q4, q6
+    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+    vadd.s32 q1, q3, q2
+    vmov q5, q7
+    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32 q1, q1, q6
+    vadd.s32 q6, q6, q6
+    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+    vshrn.s32 ROW1L, q1, #16
+    vsub.s32 q1, q1, q6
+    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32 q3, q3, q2
+    vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+    vadd.s32 q1, q3, q5
+    vsub.s32 q3, q3, q5
+    vshll.s16 q5, ROW0L, #13
+    vshrn.s32 ROW2L, q1, #16
+    vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+    vadd.s32 q2, q5, q6
+    vsub.s32 q1, q5, q6
+    vadd.s32 q6, q2, q7
+    vsub.s32 q2, q2, q7
+    vadd.s32 q5, q1, q4
+    vsub.s32 q3, q1, q4
+    vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+    vshrn.s32 ROW3L, q5, #16
+    vshrn.s32 ROW0L, q6, #16
+    vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16 {d2}, [ip, :64] /* reload constants */
+    vmull.s16 q6, ROW5L, XFIX_1_175875602
+    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16 q7, ROW7L, XFIX_1_175875602
+    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16 q2, ROW6L, XFIX_0_541196100
+    vshll.s16 q3, ROW4L, #13
+    vmov q4, q6
+    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+    vadd.s32 q1, q3, q2
+    vmov q5, q7
+    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32 q1, q1, q6
+    vadd.s32 q6, q6, q6
+    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+    vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+    vsub.s32 q1, q1, q6
+    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32 q3, q3, q2
+    vshrn.s32 ROW6R, q1, #16
+    vadd.s32 q1, q3, q5
+    vsub.s32 q3, q3, q5
+    vshll.s16 q5, ROW4L, #13
+    vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+    vshrn.s32 ROW5R, q3, #16
+    vadd.s32 q2, q5, q6
+    vsub.s32 q1, q5, q6
+    vadd.s32 q6, q2, q7
+    vsub.s32 q2, q2, q7
+    vadd.s32 q5, q1, q4
+    vsub.s32 q3, q1, q4
+    vshrn.s32 ROW7R, q2, #16
+    vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+    vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+    vshrn.s32 ROW4R, q3, #16
+    b 2b /* Go to epilogue */
     .unreq DCT_TABLE
     .unreq COEF_BLOCK
     .unreq OUTPUT_BUF