diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S index 8229fc41..9ef6efc0 100644 --- a/simd/jsimd_arm_neon.S +++ b/simd/jsimd_arm_neon.S @@ -1152,12 +1152,12 @@ asm_function jsimd_idct_2x2_neon .macro do_load size .if \size == 8 - vld1.8 {d4}, [U]! - vld1.8 {d5}, [V]! - vld1.8 {d0}, [Y]! - pld [Y, #64] + vld1.8 {d4}, [U, :64]! + vld1.8 {d5}, [V, :64]! + vld1.8 {d0}, [Y, :64]! pld [U, #64] pld [V, #64] + pld [Y, #64] .elseif \size == 4 vld1.8 {d4[0]}, [U]! vld1.8 {d4[1]}, [U]! @@ -1227,7 +1227,11 @@ asm_function jsimd_idct_2x2_neon .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs -.macro do_yuv_to_rgb +/* + * 2 stage pipelined YCbCr->RGB conversion: stage 1 forms (u - 128), (v - 128) and their products, stage 2 narrows the products, adds Y (d0) and saturates + */ + +.macro do_yuv_to_rgb_stage1 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ @@ -1238,6 +1242,9 @@ asm_function jsimd_idct_2x2_neon vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 vrshrn.s32 d20, q10, #15 vrshrn.s32 d21, q11, #15 vrshrn.s32 d24, q12, #14 @@ -1252,6 +1259,43 @@ asm_function jsimd_idct_2x2_neon vqmovun.s16 d1\b_offs, q14 .endm +.macro do_yuv_to_rgb_stage2_store_load_stage1 + vld1.8 {d4}, [U, :64]! + vrshrn.s32 d20, q10, #15 + vrshrn.s32 d21, q11, #15 + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vrshrn.s32 d28, q14, #14 + vld1.8 {d5}, [V, :64]! + vrshrn.s32 d29, q15, #14 + vaddw.u8 q10, q10, d0 + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + vqmovun.s16 d1\g_offs, q10 + vld1.8 {d0}, [Y, :64]! 
+ vqmovun.s16 d1\r_offs, q12 + pld [U, #64] + pld [V, #64] + pld [Y, #64] + vqmovun.s16 d1\b_offs, q14 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ + do_store \bpp, 8 + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + /* Apple gas crashes on adrl, work around that by using adr. * But this requires a copy of these constants for each function. */ @@ -1312,16 +1356,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon /* Inner loop over pixels */ subs N, N, #8 + blt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 blt 2f 1: - do_load 8 - do_yuv_to_rgb - do_store \bpp, 8 + do_yuv_to_rgb_stage2_store_load_stage1 subs N, N, #8 bge 1b +2: /* drain: finish and store the block that has only been through stage 1 */ + do_yuv_to_rgb_stage2 + do_store \bpp, 8 tst N, #7 beq 8f -2: +3: /* tail: fewer than 8 pixels left; the next 3: label is the target of the beq 3f below */ tst N, #4 beq 3f do_load 4 @@ -1369,6 +1418,9 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .endfunc .purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 .endm