Accelerated 4:2:2 upsampling routine for ARM (improves performance ~20-30% when decompressing 4:2:2 JPEGs using fancy upsampling)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@837 632fc199-4ca6-4c93-a231-07263d6284db
DRC
2012-06-13 05:17:03 +00:00
parent 69799275be
commit 316617faf4
4 changed files with 258 additions and 0 deletions


@@ -26,6 +26,10 @@ it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.
[6] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
platforms.
1.2.0
=====


@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,


@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_ARM_NEON)
return 1;
return 0;
}
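
(For context, a minimal sketch of how this capability check is consumed: the
upsampler setup in jdsample.c installs the SIMD method only when the check
succeeds. The snippet below paraphrases that selection logic and is not part
of this commit; variable and field names are approximations.)

  if (do_fancy && compptr->downsampled_width > 2) {
    if (jsimd_can_h2v1_fancy_upsample())
      upsample->methods[ci] = jsimd_h2v1_fancy_upsample;  /* NEON path */
    else
      upsample->methods[ci] = h2v1_fancy_upsample;        /* plain C fallback */
  }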
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
if (simd_support & JSIMD_ARM_NEON)
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data, output_data_ptr);
}
GLOBAL(int)


@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
/*****************************************************************************/
/*
* GLOBAL(void)
* jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
* JDIMENSION downsampled_width,
* JSAMPARRAY input_data,
* JSAMPARRAY * output_data_ptr);
*
* Note: the use of unaligned writes is the main remaining bottleneck in
* this code. Eliminating it could potentially yield a further performance
* improvement on the order of tens of percent on Cortex-A8/Cortex-A9.
*/
/*
* Upsample 16 source pixels to 32 destination pixels. The new 16 source
* pixels are loaded to q0. The previous 16 source pixels are in q1. The
* shifted-by-one source pixels are constructed in q2 by using q0 and q1.
* Register d28 is used for multiplication by 3. Register q15 is used
* for adding +1 bias.
*/
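/*
* For reference, a scalar C sketch of the arithmetic that these macros
* vectorize (an illustration following the "fancy" h2v1 upsampling in
* jdsample.c, not part of the committed file). The first and last output
* pixels are plain copies; every other output pair straddles two adjacent
* source pixels and is a 3:1 weighted average of them:
*
*   out[0] = in[0];
*   for (i = 1; i < width; i++) {
*     out[2 * i - 1] = (3 * in[i - 1] + in[i]     + 2) >> 2;   (vrshrn: rounds)
*     out[2 * i]     = (3 * in[i]     + in[i - 1] + 1) >> 2;   (vshrn: truncates)
*   }
*   out[2 * width - 1] = in[width - 1];
*/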
.macro upsample16 OUTPTR, INPTR
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vmov q1, q0 /* backup source pixels to q1 */
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
* Upsample 32 source pixels to 64 destination pixels. Compared to the
* 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
* the even and odd groups of 16 pixels, so no "vmov q1, q0" instructions are
* needed. This unrolling also allows loads and stores to be reordered to
* hide multiplication latency and reduce stalls.
*/
.macro upsample32 OUTPTR, INPTR
/* even 16 pixels group */
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
/* odd 16 pixels group */
vld1.8 {q1}, [\INPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vmovl.u8 q8, d2
vext.8 q2, q0, q1, #15
vmovl.u8 q9, d3
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d2, d28
vmlal.u8 q11, d3, d28
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
* Upsample a row of WIDTH pixels from INPTR to OUTPTR.
*/
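/*
* Structure of the macro below: the first and last output pixels are handled
* with scalar byte loads/stores, the bulk of the row is then processed in
* 32-pixel blocks (plus an optional 16-pixel block), and any remaining 1-15
* source pixels are gathered backwards into d0/d1, upsampled once more, and
* stored in 8-, 4-, 2- and 1-pixel pieces.
*/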
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
/* special case for the first and last pixels */
sub \WIDTH, \WIDTH, #1
add \OUTPTR, \OUTPTR, #1
ldrb \TMP1, [\INPTR, \WIDTH]
strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
ldrb \TMP1, [\INPTR], #1
strb \TMP1, [\OUTPTR, #-1]
vmov.8 d3[7], \TMP1
subs \WIDTH, \WIDTH, #32
blt 5f
0: /* process 32 pixels per iteration */
upsample32 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #32
bge 0b
5:
adds \WIDTH, \WIDTH, #16
blt 1f
0: /* process 16 pixels if needed */
upsample16 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #16
1:
adds \WIDTH, \WIDTH, #16
beq 9f
/* load the remaining 1-15 pixels */
add \INPTR, \INPTR, \WIDTH
tst \WIDTH, #1
beq 2f
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #2
beq 2f
vext.8 d0, d0, d0, #6
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #4
beq 2f
vrev64.32 d0, d0
sub \INPTR, \INPTR, #1
vld1.8 {d0[3]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[2]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #8
beq 2f
vmov d1, d0
sub \INPTR, \INPTR, #8
vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vrshrn.u16 d10, q8, #2
vrshrn.u16 d12, q9, #2
vshrn.u16 d11, q10, #2
vshrn.u16 d13, q11, #2
vzip.8 d10, d11
vzip.8 d12, d13
/* store the remaining pixels */
tst \WIDTH, #8
beq 2f
vst1.8 {d10, d11}, [\OUTPTR]!
vmov q5, q6
2:
tst \WIDTH, #4
beq 2f
vst1.8 {d10}, [\OUTPTR]!
vmov d10, d11
2:
tst \WIDTH, #2
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
vst1.8 {d10[2]}, [\OUTPTR]!
vst1.8 {d10[3]}, [\OUTPTR]!
vext.8 d10, d10, d10, #4
2:
tst \WIDTH, #1
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm
asm_function jsimd_h2v1_fancy_upsample_neon
MAX_V_SAMP_FACTOR .req r0
DOWNSAMPLED_WIDTH .req r1
INPUT_DATA .req r2
OUTPUT_DATA_PTR .req r3
OUTPUT_DATA .req OUTPUT_DATA_PTR
OUTPTR .req r4
INPTR .req r5
WIDTH .req ip
TMP .req lr
push {r4, r5, r6, lr}
vpush {d8-d15}
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
cmp MAX_V_SAMP_FACTOR, #0
ble 99f
/* initialize constants */
vmov.u8 d28, #3
vmov.u16 q15, #1
11:
ldr INPTR, [INPUT_DATA], #4
ldr OUTPTR, [OUTPUT_DATA], #4
mov WIDTH, DOWNSAMPLED_WIDTH
upsample_row OUTPTR, INPTR, WIDTH, TMP
subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
bgt 11b
99:
vpop {d8-d15}
pop {r4, r5, r6, pc}
.unreq MAX_V_SAMP_FACTOR
.unreq DOWNSAMPLED_WIDTH
.unreq INPUT_DATA
.unreq OUTPUT_DATA_PTR
.unreq OUTPUT_DATA
.unreq OUTPTR
.unreq INPTR
.unreq WIDTH
.unreq TMP
.endfunc
.purgem upsample16
.purgem upsample32
.purgem upsample_row