Accelerated 4:2:2 upsampling routine for ARM (improves performance ~20-30% when decompressing 4:2:2 JPEGs using fancy upsampling)
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@837 632fc199-4ca6-4c93-a231-07263d6284db
@@ -26,6 +26,10 @@ it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.

[6] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
platforms.


1.2.0
=====

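For reference, "fancy" upsampling doubles the horizontal chroma resolution
with a triangle filter: each source pixel contributes 3/4 of itself and 1/4
of its nearer neighbor, and the two edge pixels are copied through. A minimal
scalar sketch of the rule (a hypothetical helper in the spirit of libjpeg's
h2v1_fancy_upsample in jdsample.c; assumes width >= 2):

/* Upsample one row of width chroma samples to 2 * width samples. */
static void
fancy_upsample_row (const unsigned char * s, unsigned char * out,
                    unsigned int width)
{
  unsigned int i;

  out[0] = s[0];                       /* left edge: plain copy */
  out[2 * width - 1] = s[width - 1];   /* right edge: plain copy */
  for (i = 0; i < width; i++) {
    if (i > 0)                         /* 3/4 * s[i] + 1/4 * s[i-1] */
      out[2 * i] = (3 * s[i] + s[i - 1] + 1) >> 2;
    if (i < width - 1)                 /* 3/4 * s[i] + 1/4 * s[i+1] */
      out[2 * i + 1] = (3 * s[i] + s[i + 1] + 2) >> 2;
  }
}

The +1 and +2 rounding biases differ so that the two output phases do not
round in the same direction; the NEON routine below performs the same
arithmetic 16 pixels at a time.
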
@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));

EXTERN(void) jsimd_h2v1_fancy_upsample_neon
        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));

/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                     JDIMENSION start_col,

@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ARM_NEON)
    return 1;

  return 0;
}

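The probe above is consulted once, when the upsampler is set up; the
selection itself happens outside this diff, but the pattern is roughly the
following (in the style of libjpeg-turbo's jdsample.c):

  if (jsimd_can_h2v1_fancy_upsample())
    upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
  else
    upsample->methods[ci] = h2v1_fancy_upsample;  /* portable C version */
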
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
                           JSAMPARRAY input_data,
                           JSAMPARRAY * output_data_ptr)
{
  if (simd_support & JSIMD_ARM_NEON)
    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
        compptr->downsampled_width, input_data, output_data_ptr);
}

GLOBAL(int)

@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
    .unreq          SHIFT
    .unreq          LOOP_COUNT
.endfunc

/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 * this code; eliminating it could potentially yield a further performance
 * improvement of tens of percent on Cortex-A8/Cortex-A9.
 */

/*
 * Upsample 16 source pixels to 32 destination pixels. The 16 new source
 * pixels are loaded into q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 from q0 and q1.
 * Register d28 is used for multiplication by 3, and register q15 holds
 * the +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0       /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

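/*
 * In C terms, with c = a pixel from q0 and p = the matching shifted-by-one
 * pixel from q2, the two results computed above are
 *
 *   (c + 3 * p + 2) >> 2    -- q8/q9 path; vrshrn supplies the +2 rounding
 *   (3 * c + p + 1) >> 2    -- q10/q11 path; the +1 bias was pre-added from
 *                              q15, so a truncating vshrn suffices
 *
 * i.e. the two fancy-upsampling outputs for each adjacent pixel pair, which
 * vst2.8 interleaves into the destination row.
 */
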
/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped
 * between the even and odd groups of 16 pixels, which is why no
 * "vmov q1, q0" instruction is needed. This unrolling also allows loads
 * and stores to be reordered to hide multiplication latency and reduce
 * stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    /* odd 16 pixels group */
    vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vmovl.u8        q8, d2
    vext.8          q2, q0, q1, #15
    vmovl.u8        q9, d3
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d2, d28
    vmlal.u8        q11, d3, d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

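/*
 * Scheduling note: the load for the odd group is issued before the even
 * group's results are narrowed and stored, so the multiply latency of one
 * group is hidden behind the memory operations of the other.
 */
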
/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub             \WIDTH, \WIDTH, #1
    add             \OUTPTR, \OUTPTR, #1
    ldrb            \TMP1, [\INPTR, \WIDTH]
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
    ldrb            \TMP1, [\INPTR], #1
    strb            \TMP1, [\OUTPTR, #-1]
    vmov.8          d3[7], \TMP1 /* seed the shifted-by-one lane with s[0] */

    subs            \WIDTH, \WIDTH, #32
    blt             5f
0:  /* process 32 pixels per iteration */
    upsample32      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #32
    bge             0b
5:
    adds            \WIDTH, \WIDTH, #16
    blt             1f
0:  /* process 16 pixels if needed */
    upsample16      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #16
1:
    adds            \WIDTH, \WIDTH, #16
    beq             9f

    /* load the remaining 1-15 pixels */
    add             \INPTR, \INPTR, \WIDTH
    tst             \WIDTH, #1
    beq             2f
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #2
    beq             2f
    vext.8          d0, d0, d0, #6
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #4
    beq             2f
    vrev64.32       d0, d0
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[3]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[2]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #8
    beq             2f
    vmov            d1, d0
    sub             \INPTR, \INPTR, #8
    vld1.8          {d0}, [\INPTR]
2:  /* upsample the remaining pixels */
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vrshrn.u16      d10, q8, #2
    vrshrn.u16      d12, q9, #2
    vshrn.u16       d11, q10, #2
    vshrn.u16       d13, q11, #2
    vzip.8          d10, d11
    vzip.8          d12, d13
    /* store the remaining pixels */
    tst             \WIDTH, #8
    beq             2f
    vst1.8          {d10, d11}, [\OUTPTR]!
    vmov            q5, q6
2:
    tst             \WIDTH, #4
    beq             2f
    vst1.8          {d10}, [\OUTPTR]!
    vmov            d10, d11
2:
    tst             \WIDTH, #2
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
    vst1.8          {d10[2]}, [\OUTPTR]!
    vst1.8          {d10[3]}, [\OUTPTR]!
    vext.8          d10, d10, d10, #4
2:
    tst             \WIDTH, #1
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
2:
9:
.endm
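
/*
 * The tail of 'upsample_row' (the last 1-15 source pixels), as a rough C
 * model (illustrative only):
 *
 *   n = remaining pixels (1-15);
 *   gather the last n source pixels into q0, testing bits 1/2/4/8 of n;
 *   run one 16-wide upsample step (lanes beyond n are don't-cares);
 *   vzip the results and store the 2 * n output bytes, testing bits
 *   8/4/2/1 of n.
 */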

asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    push            {r4, r5, r6, lr}
    vpush           {d8-d15}

    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0
    ble             99f

    /* initialize constants */
    vmov.u8         d28, #3
    vmov.u16        q15, #1
11: /* loop over max_v_samp_factor rows */
    ldr             INPTR, [INPUT_DATA], #4   /* next input row pointer */
    ldr             OUTPTR, [OUTPUT_DATA], #4 /* next output row pointer */
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt             11b

99:
    vpop            {d8-d15}
    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA_PTR
    .unreq          OUTPUT_DATA

    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          WIDTH
    .unreq          TMP

.endfunc

.purgem upsample16
.purgem upsample32
.purgem upsample_row