Accelerated 4:2:2 upsampling routine for ARM (improves performance ~20-30% when decompressing 4:2:2 JPEGs using fancy upsampling)
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@773 632fc199-4ca6-4c93-a231-07263d6284db
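For reference, this is the arithmetic being vectorized: "fancy" h2v1 upsampling doubles a row horizontally with a 3/4-1/4 triangle filter and replicates the first and last pixels. A scalar sketch in C (the helper name is illustrative; the formulas follow the scalar fancy upsampler in libjpeg's jdsample.c):

/* Scalar reference: "fancy" 2:1 horizontal upsampling of one row.
 * The helper name is illustrative; the arithmetic follows the scalar
 * fancy upsampler (h2v1_fancy_upsample) in libjpeg's jdsample.c. */
void
h2v1_fancy_upsample_ref (const unsigned char *in, unsigned char *out,
                         unsigned int width)    /* downsampled width */
{
  unsigned int i;

  out[0] = in[0];                          /* replicate first pixel */
  out[2 * width - 1] = in[width - 1];      /* replicate last pixel */
  for (i = 0; i < width - 1; i++)          /* right neighbours, +2 bias */
    out[2 * i + 1] = (unsigned char) ((3 * in[i] + in[i + 1] + 2) >> 2);
  for (i = 1; i < width; i++)              /* left neighbours, +1 bias */
    out[2 * i] = (unsigned char) ((3 * in[i] + in[i - 1] + 1) >> 2);
}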
@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
 /* SIMD Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
                            JSAMPARRAY input_data,
                            JSAMPARRAY * output_data_ptr)
 {
+  if (simd_support & JSIMD_ARM_NEON)
+    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
 }
 
 GLOBAL(int)
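The assembly below implements the same arithmetic 16 (or 32) source pixels at a time. As a bridge between the C formulas and the vector code, here is a per-lane sketch of what each upsample16/upsample32 step computes (hypothetical helper, for illustration only; the register mapping follows the comments in the code):

#include <stdint.h>

/* Per-lane arithmetic of the upsample16/upsample32 macros below
 * (hypothetical helper, for illustration only).  "prev" is the
 * shifted-by-one pixel built with vext #15; the +2 rounding of the
 * first stream comes from vrshrn.u16 #2, while the second stream gets
 * its +1 bias from the q15 constant and a truncating vshrn.u16 #2.
 * vst2.8 {d6, d7, d8, d9} then interleaves the two streams so they
 * land as consecutive output pixels. */
static inline void
upsample_pair (uint8_t prev, uint8_t cur, uint8_t *out_a, uint8_t *out_b)
{
  *out_a = (uint8_t) ((3 * prev + cur + 2) >> 2);  /* q8/q9 stream   */
  *out_b = (uint8_t) ((3 * cur + prev + 1) >> 2);  /* q10/q11 stream */
}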
@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
     .unreq          SHIFT
     .unreq          LOOP_COUNT
 .endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
+ *                                 JDIMENSION   downsampled_width,
+ *                                 JSAMPARRAY   input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ * this code, which can potentially be solved to gain up to tens of
+ * percent in performance on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16 OUTPTR, INPTR
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vmov            q1, q0       /* backup source pixels to q1 */
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the
+ * 'upsample16' macro, the roles of the q0 and q1 registers are reversed
+ * for the even and odd groups of 16 pixels, which is why no "vmov q1, q0"
+ * instructions are needed. Also, this unrolling allows loads and stores
+ * to be reordered to compensate for multiplication latency and reduce stalls.
+ */
+.macro upsample32 OUTPTR, INPTR
+    /* even 16 pixels group */
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    /* odd 16 pixels group */
+    vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vmovl.u8        q8, d2
+    vext.8          q2, q0, q1, #15
+    vmovl.u8        q9, d3
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d2, d28
+    vmlal.u8        q11, d3, d28
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+    /* special case for the first and last pixels */
+    sub             \WIDTH, \WIDTH, #1
+    add             \OUTPTR, \OUTPTR, #1
+    ldrb            \TMP1, [\INPTR, \WIDTH]
+    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
+    ldrb            \TMP1, [\INPTR], #1
+    strb            \TMP1, [\OUTPTR, #-1]
+    vmov.8          d3[7], \TMP1
+
+    subs            \WIDTH, \WIDTH, #32
+    blt             5f
+0:  /* process 32 pixels per iteration */
+    upsample32      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #32
+    bge             0b
+5:
+    adds            \WIDTH, \WIDTH, #16
+    blt             1f
+0:  /* process 16 pixels if needed */
+    upsample16      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #16
+1:
+    adds            \WIDTH, \WIDTH, #16
+    beq             9f
+
+    /* load the remaining 1-15 pixels */
+    add             \INPTR, \INPTR, \WIDTH
+    tst             \WIDTH, #1
+    beq             2f
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vext.8          d0, d0, d0, #6
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vrev64.32       d0, d0
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[3]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[2]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #8
+    beq             2f
+    vmov            d1, d0
+    sub             \INPTR, \INPTR, #8
+    vld1.8          {d0}, [\INPTR]
+2:  /* upsample the remaining pixels */
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vrshrn.u16      d10, q8, #2
+    vrshrn.u16      d12, q9, #2
+    vshrn.u16       d11, q10, #2
+    vshrn.u16       d13, q11, #2
+    vzip.8          d10, d11
+    vzip.8          d12, d13
+    /* store the remaining pixels */
+    tst             \WIDTH, #8
+    beq             2f
+    vst1.8          {d10, d11}, [\OUTPTR]!
+    vmov            q5, q6
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vst1.8          {d10}, [\OUTPTR]!
+    vmov            d10, d11
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+    vst1.8          {d10[2]}, [\OUTPTR]!
+    vst1.8          {d10[3]}, [\OUTPTR]!
+    vext.8          d10, d10, d10, #4
+2:
+    tst             \WIDTH, #1
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+    MAX_V_SAMP_FACTOR .req r0
+    DOWNSAMPLED_WIDTH .req r1
+    INPUT_DATA        .req r2
+    OUTPUT_DATA_PTR   .req r3
+    OUTPUT_DATA       .req OUTPUT_DATA_PTR
+
+    OUTPTR            .req r4
+    INPTR             .req r5
+    WIDTH             .req ip
+    TMP               .req lr
+
+    push            {r4, r5, r6, lr}
+    vpush           {d8-d15}
+
+    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
+    cmp             MAX_V_SAMP_FACTOR, #0
+    ble             99f
+
+    /* initialize constants */
+    vmov.u8         d28, #3
+    vmov.u16        q15, #1
+11:
+    ldr             INPTR, [INPUT_DATA], #4
+    ldr             OUTPTR, [OUTPUT_DATA], #4
+    mov             WIDTH, DOWNSAMPLED_WIDTH
+    upsample_row    OUTPTR, INPTR, WIDTH, TMP
+    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+    bgt             11b
+
+99:
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, pc}
+
+    .unreq          MAX_V_SAMP_FACTOR
+    .unreq          DOWNSAMPLED_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA_PTR
+    .unreq          OUTPUT_DATA
+
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          WIDTH
+    .unreq          TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row
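A minimal test-harness sketch (not part of this commit) showing how the new routine could be checked against the scalar reference above. It assumes a 32-bit ARM build with NEON (the row-pointer loads use `ldr ..., #4`, and the capability check requires sizeof(JDIMENSION) == 4) and spells out the libjpeg typedefs locally so it stays self-contained:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Local stand-ins for the jpeglib typedefs (jmorecfg.h) */
typedef unsigned char JSAMPLE;
typedef unsigned int JDIMENSION;
typedef JSAMPLE *JSAMPROW;
typedef JSAMPROW *JSAMPARRAY;

/* The routine added by this commit */
extern void jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
                                            JDIMENSION downsampled_width,
                                            JSAMPARRAY input_data,
                                            JSAMPARRAY * output_data_ptr);

/* The scalar reference sketched earlier on this page */
extern void h2v1_fancy_upsample_ref (const unsigned char *in,
                                     unsigned char *out, unsigned int width);

int
main (void)
{
  enum { WIDTH = 123 };                /* deliberately not a multiple of 32 */
  JSAMPLE in_row[WIDTH], out_row[2 * WIDTH], ref_row[2 * WIDTH];
  JSAMPROW input_rows[1] = { in_row };
  JSAMPROW output_rows[1] = { out_row };
  JSAMPARRAY output_data = output_rows;
  int i;

  srand(42);
  for (i = 0; i < WIDTH; i++)
    in_row[i] = (JSAMPLE) (rand() & 0xFF);

  h2v1_fancy_upsample_ref(in_row, ref_row, WIDTH);
  jsimd_h2v1_fancy_upsample_neon(1, WIDTH, input_rows, &output_data);

  if (memcmp(out_row, ref_row, sizeof(ref_row)) != 0) {
    fprintf(stderr, "NEON output differs from scalar reference\n");
    return 1;
  }
  printf("OK\n");
  return 0;
}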