Improve the performance of YCbCr to RGB conversion on ARM
This commit is contained in:
@@ -1152,12 +1152,12 @@ asm_function jsimd_idct_2x2_neon
|
||||
|
||||
.macro do_load size
|
||||
.if \size == 8
|
||||
vld1.8 {d4}, [U]!
|
||||
vld1.8 {d5}, [V]!
|
||||
vld1.8 {d0}, [Y]!
|
||||
pld [Y, #64]
|
||||
vld1.8 {d4}, [U, :64]!
|
||||
vld1.8 {d5}, [V, :64]!
|
||||
vld1.8 {d0}, [Y, :64]!
|
||||
pld [U, #64]
|
||||
pld [V, #64]
|
||||
pld [Y, #64]
|
||||
.elseif \size == 4
|
||||
vld1.8 {d4[0]}, [U]!
|
||||
vld1.8 {d4[1]}, [U]!
|
||||
@@ -1227,7 +1227,11 @@ asm_function jsimd_idct_2x2_neon
|
||||
|
||||
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
|
||||
|
||||
.macro do_yuv_to_rgb
|
||||
/*
|
||||
* 2 stage pipelined YCbCr->RGB conversion
|
||||
*/
|
||||
|
||||
.macro do_yuv_to_rgb_stage1
|
||||
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
|
||||
vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
|
||||
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
|
||||
@@ -1238,6 +1242,9 @@ asm_function jsimd_idct_2x2_neon
|
||||
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
|
||||
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
|
||||
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb_stage2
|
||||
vrshrn.s32 d20, q10, #15
|
||||
vrshrn.s32 d21, q11, #15
|
||||
vrshrn.s32 d24, q12, #14
|
||||
@@ -1252,6 +1259,43 @@ asm_function jsimd_idct_2x2_neon
|
||||
vqmovun.s16 d1\b_offs, q14
|
||||
.endm
|
||||
|
||||
/*
 * Fused loop-body macro: performs the same instructions as
 * do_yuv_to_rgb_stage2 on the products currently held in q10-q15,
 * issues "do_store" for 8 pixels, and interleaves with that the loads of
 * the next 8 U/V/Y bytes plus the stage 1 widening adds and multiplies
 * for them. On exit q10-q15 again hold stage 1 products, so the macro
 * can be invoked repeatedly or followed by do_yuv_to_rgb_stage2.
 *
 * The instruction order is deliberate; do not reorder without checking
 * the register dependencies noted below.
 *
 * NOTE(review): the ":64" qualifier on the loads asserts 8-byte alignment
 * of U, V and Y - confirm that every caller guarantees this.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1
|
||||
/* Next 8 U bytes; d4 is not read again until the vaddw.u8 into q3 below */
vld1.8 {d4}, [U, :64]!
|
||||
/* Rounding narrow of the 32-bit products: d20/d21 form q10 (shift 15) */
vrshrn.s32 d20, q10, #15
|
||||
vrshrn.s32 d21, q11, #15
|
||||
/* d24/d25 form q12 and d28/d29 form q14 (shift 14) */
vrshrn.s32 d24, q12, #14
|
||||
vrshrn.s32 d25, q13, #14
|
||||
vrshrn.s32 d28, q14, #14
|
||||
/* Next 8 V bytes; d5 is not read again until the vaddw.u8 into q4 below */
vld1.8 {d5}, [V, :64]!
|
||||
vrshrn.s32 d29, q15, #14
|
||||
/* Add the widened Y bytes (d0) of the current pixels to each channel */
vaddw.u8 q10, q10, d0
|
||||
vaddw.u8 q12, q12, d0
|
||||
vaddw.u8 q14, q14, d0
|
||||
/* Saturating narrow to unsigned 8 bit into the d1<g_offs> register */
vqmovun.s16 d1\g_offs, q10
|
||||
/* d0 may be overwritten now: all three vaddw.u8 reading it are above */
vld1.8 {d0}, [Y, :64]!
|
||||
vqmovun.s16 d1\r_offs, q12
|
||||
/* Prefetch 64 bytes beyond the already advanced input pointers */
pld [U, #64]
|
||||
pld [V, #64]
|
||||
pld [Y, #64]
|
||||
vqmovun.s16 d1\b_offs, q14
|
||||
/* Stage 1 for the next pixels starts here (same sequence as do_yuv_to_rgb_stage1) */
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
|
||||
vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
|
||||
/* Issued after all three vqmovun.s16 results are written; presumably */
/* writes those d1<offs> registers to the output - do_store is not shown here */
do_store \bpp, 8
|
||||
/* q10/q11: sum of the d1[1] and d1[2] products (low/high halves) */
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
|
||||
vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
|
||||
vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
|
||||
vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
|
||||
/* q12/q13: products of q4 (d8/d9) with d1[0] */
vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
|
||||
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
|
||||
/* q14/q15: products of q3 (d6/d7) with d1[3] */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
|
||||
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
|
||||
.endm
|
||||
|
||||
/*
 * Non-pipelined conversion: expands to stage 1 immediately followed by
 * stage 2, with no load or store interleaved between them.
 */
.macro do_yuv_to_rgb
|
||||
do_yuv_to_rgb_stage1
|
||||
do_yuv_to_rgb_stage2
|
||||
.endm
|
||||
|
||||
/* Apple gas crashes on adrl, work around that by using adr.
|
||||
* But this requires a copy of these constants for each function.
|
||||
*/
|
||||
@@ -1312,16 +1356,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
/* Inner loop over pixels */
|
||||
subs N, N, #8
|
||||
blt 3f
|
||||
do_load 8
|
||||
do_yuv_to_rgb_stage1
|
||||
subs N, N, #8
|
||||
blt 2f
|
||||
1:
|
||||
do_load 8
|
||||
do_yuv_to_rgb
|
||||
do_store \bpp, 8
|
||||
do_yuv_to_rgb_stage2_store_load_stage1
|
||||
subs N, N, #8
|
||||
bge 1b
|
||||
2:
|
||||
do_yuv_to_rgb_stage2
|
||||
do_store \bpp, 8
|
||||
tst N, #7
|
||||
beq 8f
|
||||
2:
|
||||
3:
|
||||
tst N, #4
|
||||
beq 3f
|
||||
do_load 4
|
||||
@@ -1369,6 +1418,9 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
.endfunc
|
||||
|
||||
.purgem do_yuv_to_rgb
|
||||
.purgem do_yuv_to_rgb_stage1
|
||||
.purgem do_yuv_to_rgb_stage2
|
||||
.purgem do_yuv_to_rgb_stage2_store_load_stage1
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
Reference in New Issue
Block a user