Accelerated 4:2:2 upsampling routine for ARM (improves performance ~20-30% when decompressing 4:2:2 JPEGs using fancy upsampling)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@773 632fc199-4ca6-4c93-a231-07263d6284db
Author: DRC
Date:   2012-02-02 22:32:45 +00:00
3 changed files with 254 additions and 0 deletions
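
For context, "fancy" (smooth) h2v1 upsampling doubles each chroma row horizontally, weighting the nearer source sample 3/4 and the further one 1/4. A minimal scalar sketch of one row, following the jdsample.c reference path that this NEON routine accelerates (the row-level helper name is illustrative):

typedef unsigned char JSAMPLE;    /* libjpeg sample type for 8-bit builds    */
typedef unsigned int JDIMENSION;  /* libjpeg dimension type (checked below)  */

/* Upsample one row of 'w' source pixels to '2 * w' output pixels. */
static void h2v1_fancy_upsample_row (JSAMPLE *out, const JSAMPLE *in,
                                     JDIMENSION w)
{
  JDIMENSION i;

  out[0] = in[0];              /* first output pixel has no left neighbor  */
  out[2 * w - 1] = in[w - 1];  /* last output pixel has no right neighbor  */

  for (i = 1; i < w; i++)      /* even outputs: 3/4 * cur + 1/4 * left     */
    out[2 * i] = (JSAMPLE) ((3 * in[i] + in[i - 1] + 1) >> 2);
  for (i = 0; i + 1 < w; i++)  /* odd outputs:  3/4 * cur + 1/4 * right    */
    out[2 * i + 1] = (JSAMPLE) ((3 * in[i] + in[i + 1] + 2) >> 2);
}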


@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,


@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_ARM_NEON)
return 1;
return 0;
}
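
This capability check is queried once at upsampler setup; if it returns nonzero, the method pointer is routed to the SIMD wrapper. The selection in jdsample.c follows this pattern (a sketch, not the verbatim code):

  if (jsimd_can_h2v1_fancy_upsample())
    upsample->methods[ci] = jsimd_h2v1_fancy_upsample;  /* wrapper in the next hunk */
  else
    upsample->methods[ci] = h2v1_fancy_upsample;        /* scalar reference */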
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
jpeg_component_info * compptr,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
if (simd_support & JSIMD_ARM_NEON)
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data, output_data_ptr);
}
GLOBAL(int)


@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
/*****************************************************************************/
/*
* GLOBAL(void)
* jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
* JDIMENSION downsampled_width,
* JSAMPARRAY input_data,
* JSAMPARRAY * output_data_ptr);
*
* Note: the use of unaligned writes is the main remaining bottleneck in
* this code; addressing it could potentially yield up to tens of percent
* additional performance on Cortex-A8/Cortex-A9.
*/
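
In C terms, the NEON entry point documented here is equivalent to the following loop over the row group (a hedged restatement: upsample_row stands in for the assembly macro defined further down, and JSAMPLE/JDIMENSION are as in the sketch near the top):

typedef JSAMPLE *JSAMPROW;     /* libjpeg: pointer to one row of samples */
typedef JSAMPROW *JSAMPARRAY;  /* libjpeg: array of row pointers */

/* stands in for the 'upsample_row' assembly macro defined further down */
static void upsample_row (JSAMPLE *out, const JSAMPLE *in, JDIMENSION w);

void jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
                                     JDIMENSION downsampled_width,
                                     JSAMPARRAY input_data,
                                     JSAMPARRAY * output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  int row;

  for (row = 0; row < max_v_samp_factor; row++)
    upsample_row(output_data[row], input_data[row], downsampled_width);
}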
/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded into q0, the previous 16 source pixels are in q1, and
 * the shifted-by-one source pixels are constructed in q2 from q0 and q1.
 * Register d28 is used for multiplication by 3, and register q15 provides
 * the +1 bias. (A per-lane C mapping of this arithmetic follows the macro.)
 */
.macro upsample16 OUTPTR, INPTR
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vmov q1, q0 /* backup source pixels to q1 */
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
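
Read per lane, the macro computes the two libjpeg blend formulas; a C restatement (the helper and parameter names are mine):

/* Per-lane model of the arithmetic above (hypothetical helper). For
 * cur = in[i] (a q0 lane) and prev = in[i - 1] (the matching q2 lane),
 * the macro emits the odd output of position i - 1 and the even output
 * of position i; vst2.8 {d6, d7, d8, d9} then interleaves the d6/d8
 * (and d7/d9) pairs so the two streams alternate in memory. */
static void upsample16_lane (JSAMPLE cur, JSAMPLE prev,
                             JSAMPLE *odd_out, JSAMPLE *even_out)
{
  /* q8/q9: cur + 3 * prev, then vrshrn #2 supplies the +2 rounding term */
  *odd_out = (JSAMPLE) ((cur + 3 * prev + 2) >> 2);
  /* q10/q11: 1 + prev + 3 * cur (the q15 bias), then vshrn #2 truncates */
  *even_out = (JSAMPLE) ((3 * cur + prev + 1) >> 2);
}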
/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped
 * between the even and odd groups of 16 pixels, which eliminates the
 * "vmov q1, q0" instruction. This unrolling also allows loads and stores
 * to be reordered to hide the multiplication latency and reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
/* even 16 pixels group */
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
/* odd 16 pixels group */
vld1.8 {q1}, [\INPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vmovl.u8 q8, d2
vext.8 q2, q0, q1, #15
vmovl.u8 q9, d3
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d2, d28
vmlal.u8 q11, d3, d28
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
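Design note: the store of the even group's results (the first vst2.8) is deliberately deferred until the odd group's vmlal instructions have been issued, so the multiply-accumulate latency is hidden behind the store rather than stalling the narrowing shifts that follow.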
/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR. (A scalar model
 * of this control flow follows the macro.)
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
/* special case for the first and last pixels */
sub \WIDTH, \WIDTH, #1
add \OUTPTR, \OUTPTR, #1
ldrb \TMP1, [\INPTR, \WIDTH]
strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
ldrb \TMP1, [\INPTR], #1
strb \TMP1, [\OUTPTR, #-1]
vmov.8 d3[7], \TMP1
subs \WIDTH, \WIDTH, #32
blt 5f
0: /* process 32 pixels per iteration */
upsample32 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #32
bge 0b
5:
adds \WIDTH, \WIDTH, #16
blt 1f
0: /* process 16 pixels if needed */
upsample16 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #16
1:
adds \WIDTH, \WIDTH, #16
beq 9f
/* load the remaining 1-15 pixels */
add \INPTR, \INPTR, \WIDTH
tst \WIDTH, #1
beq 2f
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #2
beq 2f
vext.8 d0, d0, d0, #6
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #4
beq 2f
vrev64.32 d0, d0
sub \INPTR, \INPTR, #1
vld1.8 {d0[3]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[2]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #8
beq 2f
vmov d1, d0
sub \INPTR, \INPTR, #8
vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vrshrn.u16 d10, q8, #2
vrshrn.u16 d12, q9, #2
vshrn.u16 d11, q10, #2
vshrn.u16 d13, q11, #2
vzip.8 d10, d11
vzip.8 d12, d13
/* store the remaining pixels */
tst \WIDTH, #8
beq 2f
vst1.8 {d10, d11}, [\OUTPTR]!
vmov q5, q6
2:
tst \WIDTH, #4
beq 2f
vst1.8 {d10}, [\OUTPTR]!
vmov d10, d11
2:
tst \WIDTH, #2
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
vst1.8 {d10[2]}, [\OUTPTR]!
vst1.8 {d10[3]}, [\OUTPTR]!
vext.8 d10, d10, d10, #4
2:
tst \WIDTH, #1
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm
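
For reference, the pointer and width bookkeeping in this macro corresponds to the following scalar model, which the NEON code evaluates 32 or 16 pixels at a time, with a right-aligned reload of the final 1-15 pixels (illustrative name; reuses the typedefs from the sketches above):

static void upsample_row_model (JSAMPLE *out, const JSAMPLE *in, JDIMENSION w)
{
  JSAMPLE prev;

  w--; out++;                 /* mirror the WIDTH/OUTPTR adjustments above */
  out[2 * w] = in[w];         /* last output pixel, stored up front        */
  prev = *in++;               /* first pixel: seeds d3[7] in the asm...    */
  out[-1] = prev;             /* ...and is also stored verbatim as out[0]  */

  while (w-- > 0) {
    JSAMPLE cur = *in++;
    *out++ = (JSAMPLE) ((3 * prev + cur + 2) >> 2);  /* odd output  */
    *out++ = (JSAMPLE) ((3 * cur + prev + 1) >> 2);  /* even output */
    prev = cur;
  }
}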
asm_function jsimd_h2v1_fancy_upsample_neon
MAX_V_SAMP_FACTOR .req r0
DOWNSAMPLED_WIDTH .req r1
INPUT_DATA .req r2
OUTPUT_DATA_PTR .req r3
OUTPUT_DATA .req OUTPUT_DATA_PTR
OUTPTR .req r4
INPTR .req r5
WIDTH .req ip
TMP .req lr
push {r4, r5, r6, lr}
vpush {d8-d15}
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
cmp MAX_V_SAMP_FACTOR, #0
ble 99f
/* initialize constants */
vmov.u8 d28, #3
vmov.u16 q15, #1
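/* loop over the rows of the group */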
11:
ldr INPTR, [INPUT_DATA], #4
ldr OUTPTR, [OUTPUT_DATA], #4
mov WIDTH, DOWNSAMPLED_WIDTH
upsample_row OUTPTR, INPTR, WIDTH, TMP
subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
bgt 11b
99:
vpop {d8-d15}
pop {r4, r5, r6, pc}
.unreq MAX_V_SAMP_FACTOR
.unreq DOWNSAMPLED_WIDTH
.unreq INPUT_DATA
.unreq OUTPUT_DATA_PTR
.unreq OUTPUT_DATA
.unreq OUTPTR
.unreq INPTR
.unreq WIDTH
.unreq TMP
.endfunc
.purgem upsample16
.purgem upsample32
.purgem upsample_row