Accelerated 4:2:2 upsampling routine for ARM (improves performance ~20-30% when decompressing 4:2:2 JPEGs using fancy upsampling)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@837 632fc199-4ca6-4c93-a231-07263d6284db
DRC
2012-06-13 05:17:03 +00:00
parent 69799275be
commit 316617faf4
4 changed files with 258 additions and 0 deletions


@@ -26,6 +26,10 @@ it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.
[6] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
platforms.
1.2.0
=====


@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,


@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_ARM_NEON)
return 1;
return 0;
}
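
(For context, a minimal sketch of how this capability check is consumed: the
upsampler setup in jdsample.c installs the SIMD method only when the check
succeeds. The snippet below paraphrases that selection logic and is not part
of this commit; variable and field names are approximations.)

  if (do_fancy && compptr->downsampled_width > 2) {
    if (jsimd_can_h2v1_fancy_upsample())
      upsample->methods[ci] = jsimd_h2v1_fancy_upsample;  /* NEON path */
    else
      upsample->methods[ci] = h2v1_fancy_upsample;        /* plain C fallback */
  }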
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
if (simd_support & JSIMD_ARM_NEON)
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data, output_data_ptr);
}
GLOBAL(int)


@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
/*****************************************************************************/
/*
* GLOBAL(void)
* jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
* JDIMENSION downsampled_width,
* JSAMPARRAY input_data,
* JSAMPARRAY * output_data_ptr);
*
* Note: the use of unaligned writes is the main remaining bottleneck in
* this code. Eliminating it could potentially yield a further performance
* improvement on the order of tens of percent on Cortex-A8/Cortex-A9.
*/
/*
* Upsample 16 source pixels to 32 destination pixels. The new 16 source
* pixels are loaded to q0. The previous 16 source pixels are in q1. The
* shifted-by-one source pixels are constructed in q2 by using q0 and q1.
* Register d28 is used for multiplication by 3. Register q15 is used
* for adding +1 bias.
*/
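/*
* For reference, a scalar C sketch of the arithmetic that these macros
* vectorize (an illustration following the "fancy" h2v1 upsampling in
* jdsample.c, not part of the committed file). The first and last output
* pixels are plain copies; every other output pair straddles two adjacent
* source pixels and is a 3:1 weighted average of them:
*
*   out[0] = in[0];
*   for (i = 1; i < width; i++) {
*     out[2 * i - 1] = (3 * in[i - 1] + in[i]     + 2) >> 2;   (vrshrn: rounds)
*     out[2 * i]     = (3 * in[i]     + in[i - 1] + 1) >> 2;   (vshrn: truncates)
*   }
*   out[2 * width - 1] = in[width - 1];
*/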
.macro upsample16 OUTPTR, INPTR
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vmov q1, q0 /* backup source pixels to q1 */
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
* Upsample 32 source pixels to 64 destination pixels. Compared to the
* 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
* the even and odd groups of 16 pixels, so no "vmov q1, q0" instructions are
* needed. This unrolling also allows loads and stores to be reordered to
* hide multiplication latency and reduce stalls.
*/
.macro upsample32 OUTPTR, INPTR
/* even 16 pixels group */
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
/* odd 16 pixels group */
vld1.8 {q1}, [\INPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vmovl.u8 q8, d2
vext.8 q2, q0, q1, #15
vmovl.u8 q9, d3
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d2, d28
vmlal.u8 q11, d3, d28
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
* Upsample a row of WIDTH pixels from INPTR to OUTPTR.
*/
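/*
* Structure of the macro below: the first and last output pixels are handled
* with scalar byte loads/stores, the bulk of the row is then processed in
* 32-pixel blocks (plus an optional 16-pixel block), and any remaining 1-15
* source pixels are gathered backwards into d0/d1, upsampled once more, and
* stored in 8-, 4-, 2- and 1-pixel pieces.
*/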
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
/* special case for the first and last pixels */
sub \WIDTH, \WIDTH, #1
add \OUTPTR, \OUTPTR, #1
ldrb \TMP1, [\INPTR, \WIDTH]
strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
ldrb \TMP1, [\INPTR], #1
strb \TMP1, [\OUTPTR, #-1]
vmov.8 d3[7], \TMP1
subs \WIDTH, \WIDTH, #32
blt 5f
0: /* process 32 pixels per iteration */
upsample32 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #32
bge 0b
5:
adds \WIDTH, \WIDTH, #16
blt 1f
0: /* process 16 pixels if needed */
upsample16 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #16
1:
adds \WIDTH, \WIDTH, #16
beq 9f
/* load the remaining 1-15 pixels */
add \INPTR, \INPTR, \WIDTH
tst \WIDTH, #1
beq 2f
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #2
beq 2f
vext.8 d0, d0, d0, #6
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #4
beq 2f
vrev64.32 d0, d0
sub \INPTR, \INPTR, #1
vld1.8 {d0[3]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[2]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #8
beq 2f
vmov d1, d0
sub \INPTR, \INPTR, #8
vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vrshrn.u16 d10, q8, #2
vrshrn.u16 d12, q9, #2
vshrn.u16 d11, q10, #2
vshrn.u16 d13, q11, #2
vzip.8 d10, d11
vzip.8 d12, d13
/* store the remaining pixels */
tst \WIDTH, #8
beq 2f
vst1.8 {d10, d11}, [\OUTPTR]!
vmov q5, q6
2:
tst \WIDTH, #4
beq 2f
vst1.8 {d10}, [\OUTPTR]!
vmov d10, d11
2:
tst \WIDTH, #2
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
vst1.8 {d10[2]}, [\OUTPTR]!
vst1.8 {d10[3]}, [\OUTPTR]!
vext.8 d10, d10, d10, #4
2:
tst \WIDTH, #1
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm
asm_function jsimd_h2v1_fancy_upsample_neon
MAX_V_SAMP_FACTOR .req r0
DOWNSAMPLED_WIDTH .req r1
INPUT_DATA .req r2
OUTPUT_DATA_PTR .req r3
OUTPUT_DATA .req OUTPUT_DATA_PTR
OUTPTR .req r4
INPTR .req r5
WIDTH .req ip
TMP .req lr
push {r4, r5, r6, lr}
vpush {d8-d15}
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
cmp MAX_V_SAMP_FACTOR, #0
ble 99f
/* initialize constants */
vmov.u8 d28, #3
vmov.u16 q15, #1
11:
ldr INPTR, [INPUT_DATA], #4
ldr OUTPTR, [OUTPUT_DATA], #4
mov WIDTH, DOWNSAMPLED_WIDTH
upsample_row OUTPTR, INPTR, WIDTH, TMP
subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
bgt 11b
99:
vpop {d8-d15}
pop {r4, r5, r6, pc}
.unreq MAX_V_SAMP_FACTOR
.unreq DOWNSAMPLED_WIDTH
.unreq INPUT_DATA
.unreq OUTPUT_DATA_PTR
.unreq OUTPUT_DATA
.unreq OUTPTR
.unreq INPTR
.unreq WIDTH
.unreq TMP
.endfunc
.purgem upsample16
.purgem upsample32
.purgem upsample_row