Fix performance and other issues uncovered in testing with actual ARM64 hardware; formatting tweaks; remove NEON platform check (NEON is always available with ARMv8)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1333 632fc199-4ca6-4c93-a231-07263d6284db
2014-07-23 14:14:14 +00:00
parent d762c19b98
commit 3728aa01d8
2 changed files with 193 additions and 270 deletions
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -27,98 +27,29 @@
 static unsigned int simd_support = ~0;
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
 /*
  * Report whether 'feature' appears as a separate whitespace-delimited word
  * on a "Features" line.
  *
  * buffer:  one NUL-terminated line of text (as read from /proc/cpuinfo)
  * feature: NUL-terminated feature name to look for
  *
  * Returns 1 when the line starts with "Features" and contains 'feature' as
  * a whole word, 0 otherwise (including when 'feature' is empty).
  */
 LOCAL(int)
 check_feature (char *buffer, char *feature)
 {
  char *start, *p;
  size_t len;

  if (*feature == 0)
    return 0;
  if (strncmp(buffer, "Features", 8) != 0)
    return 0;
  buffer += 8;
  /* <ctype.h> functions require a value representable as unsigned char */
  while (isspace((unsigned char)*buffer))
    buffer++;

  /* Keep the start of the searched text fixed, so that the "preceded by
   * whitespace" test below cannot be bypassed by moving the search window
   * up to a match found inside a longer word (e.g. "simd" in "asimd"). */
  start = buffer;
  len = strlen(feature);
  for (p = strstr(buffer, feature); p != NULL; p = strstr(p + 1, feature)) {
    /* The match must begin at the start of the text or after whitespace */
    if (p > start && !isspace((unsigned char)p[-1]))
      continue;
    /* ... and must end at the end of the text or before whitespace */
    if (p[len] != 0 && !isspace((unsigned char)p[len]))
      continue;
    return 1;
  }
  return 0;
 }
 /*
  * Scan /proc/cpuinfo line by line using a line buffer of 'bufsize' bytes and
  * record in simd_support whether a "neon" feature is listed.
  *
  * simd_support is reset to 0 on entry.  Returns 0 when the buffer could not
  * be allocated or a line did not fit into it (the caller may retry with a
  * larger size), and 1 otherwise -- including when the file cannot be opened.
  */
 LOCAL(int)
 parse_proc_cpuinfo (int bufsize)
 {
  char *line = (char *)malloc(bufsize);
  FILE *cpuinfo;
  int complete = 1;

  simd_support = 0;
  if (line == NULL)
    return 0;

  cpuinfo = fopen("/proc/cpuinfo", "r");
  if (cpuinfo != NULL) {
    while (fgets(line, bufsize, cpuinfo) != NULL) {
      /* A line without a newline that is not the last one was truncated */
      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
        complete = 0;
        break;
      }
      if (check_feature(line, "neon"))
        simd_support |= JSIMD_ARM_NEON;
    }
    fclose(cpuinfo);
  }

  free(line);
  return complete;
 }
 #endif
 /*
 * Check what SIMD accelerations are supported.
 *
 * FIXME: This code is racy under a multi-threaded environment.
 */
 /* 
 * ARMv8 architectures support NEON extensions by default.
 * It is no longer optional as it was with ARMv7.
 */ 
 LOCAL(void)
 init_simd (void)
 {
  char *env = NULL;
 #if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
  int bufsize = 1024; /* an initial guess for the line buffer size limit */
 #endif
  if (simd_support != ~0U)
    return;
  simd_support = 0;
 #if defined(__ARM_NEON__)
  simd_support |= JSIMD_ARM_NEON;
 #elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
  /* We still have a chance to use NEON regardless of globally used
   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
   * /proc/cpuinfo parsing on linux/android */
  while (!parse_proc_cpuinfo(bufsize)) {
    bufsize *= 2;
    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
      break;
  }
 #endif
  /* Force different settings through environment variables */
  env = getenv("JSIMD_FORCENEON");
--- a/simd/jsimd_arm_neon_64.S
+++ b/simd/jsimd_arm_neon_64.S
@@ -34,7 +34,6 @@
 #define RESPECT_STRICT_ALIGNMENT 1
 #define RTSM_SQSHRN_SIM_ISSUE
 /*****************************************************************************/
@@ -257,8 +256,18 @@ asm_function jsimd_idct_islow_neon
    ROW6R           .req v29
    ROW7L           .req v30
    ROW7R           .req v31
-
+    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
    sub             sp, sp, 272
    str             x15, [sp], 16
    adr             x15, jsimd_idct_islow_neon_consts
    st1             {v0.8b - v3.8b}, [sp], 32
    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    st1             {v20.8b - v23.8b}, [sp], 32
    st1             {v24.8b - v27.8b}, [sp], 32
    st1             {v28.8b - v31.8b}, [sp], 32
    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
@@ -277,7 +286,7 @@ asm_function jsimd_idct_islow_neon
    mul             v22.4h, v22.4h, v6.4h
    mul             v23.4h, v23.4h, v7.4h
    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
-    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32
+    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
    mul             v24.4h, v24.4h, v0.4h
    mul             v25.4h, v25.4h, v1.4h
    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
@@ -293,80 +302,79 @@ asm_function jsimd_idct_islow_neon
    mul             v30.4h, v30.4h, v6.4h
    mul             v31.4h, v31.4h, v7.4h
    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
-    sub             sp, sp, #32
+    /* Go to the bottom of the stack */
-    st1             {v8.4h-v11.4h}, [sp]  /* save NEON registers */
+    sub             sp, sp, 352
-    sub             sp, sp, #32
+    stp             x4, x5, [sp], 16
-    st1             {v12.4h-v15.4h}, [sp]
+    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers */
    st1             {v12.4h - v15.4h}, [sp], 32
    /* 1-D IDCT, pass 1, left 4x8 half */
    add             v4.4h,    ROW7L.4h, ROW3L.4h
    add             v5.4h,    ROW5L.4h, ROW1L.4h
    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
    smull           v14.4s,   v4.4h,    XFIX_1_175875602
-      /* Check for the zero coefficients in the right 4x8 half */
+    /* Check for the zero coefficients in the right 4x8 half */
      /* push            {x4, x5} */
      stp             x4, x5, [sp, -16]!
      mov             x5, #0
    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
-      orr             x0,       x4,       x5
+      orr           x0,       x4,       x5
    mov             v8.16b,   v12.16b
    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl             v6.4s,    v6.4s,    #13
-      orr             x0,       x0,       x4
+      orr           x0,       x0,       x4
    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-      orr             x0,       x0 ,      x5
+      orr           x0,       x0 ,      x5
    add             v2.4s,    v6.4s,    v4.4s
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    mov             v10.16b,  v14.16b
    add             v2.4s,    v2.4s,    v12.4s
-      orr             x0,       x0,       x4
+      orr           x0,       x0,       x4
    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
-      orr             x0,       x0,       x5
+      orr           x0,       x0,       x5
    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn           ROW1L.4h, v2.4s,    #11
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    sub             v2.4s,    v2.4s,    v12.4s
    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
-      orr             x0,       x0,       x4
+      orr           x0,       x0,       x4
    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-      orr             x0,       x0,       x5
+      orr           x0,       x0,       x5
    sub             v2.4s,    v2.4s,    v12.4s
    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
    sub             v6.4s,    v6.4s,    v4.4s
-      orr             x0,       x0,       x4
+      orr           x0,       x0,       x4
    rshrn           ROW6L.4h, v2.4s,    #11
-      orr             x0,       x0,       x5
+      orr           x0,       x0,       x5
    add             v2.4s,    v6.4s,    v10.4s
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    sub             v6.4s,    v6.4s,    v10.4s
    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
-      orr             x0,       x0,       x4
+      orr           x0,       x0,       x4
    rshrn           ROW2L.4h, v2.4s,    #11
-      orr             x0,       x0,       x5
+      orr           x0,       x0,       x5
    rshrn           ROW5L.4h, v6.4s,    #11
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    shl             v10.4s,   v10.4s,   #13
    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
-      orr             x0,       x0,       x4
+      orr           x0,       x0,       x4
    add             v4.4s,    v10.4s,   v12.4s
-      orr             x0,       x0,       x5
+      orr           x0,       x0,       x5
    sub             v2.4s,    v10.4s,   v12.4s
    add             v12.4s,   v4.4s,    v14.4s
-      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    sub             v4.4s,    v4.4s,    v14.4s
    add             v10.4s,   v2.4s,    v8.4s
-      orr             x0,       x4,       x5
+      orr           x0,       x4,       x5
    sub             v6.4s,    v2.4s,    v8.4s
      /* pop             {x4, x5} */
-      ldp             x4, x5, [sp], 16
+      sub           sp, sp, 80
      ldp           x4, x5, [sp], 16
    rshrn           ROW7L.4h, v4.4s,    #11
    rshrn           ROW3L.4h, v10.4s,   #11
    rshrn           ROW0L.4h, v12.4s,   #11
@@ -552,48 +560,27 @@ asm_function jsimd_idct_islow_neon
    ins             v18.2d[1], v19.2d[0]
    ins             v20.2d[1], v21.2d[0]
    ins             v22.2d[1], v23.2d[0]
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqrshrn         v16.8b,   v16.8h,   #2
    sqrshrn2        v16.16b,  v18.8h,   #2
    sqrshrn         v18.8b,   v20.8h,   #2
    sqrshrn2        v18.16b,  v22.8h,   #2
 #else
    sqrshrn         v16.4h,   v16.4s,   #2
    sqrshrn2        v16.8h,   v18.4s,   #2
    sqrshrn         v18.4h,   v20.4s,   #2
    sqrshrn2        v18.8h,   v22.4s,   #2
 #endif
    /* vpop            {v8.4h-d15.4h} */ /* restore NEON registers */
-    ld1             {v12.4h-v15.4h}, [sp], 32
+    /* vpop            {v8.4h - d15.4h} */ /* restore NEON registers */
-    ld1             {v8.4h-v11.4h}, [sp], 32
+    ld1             {v8.4h - v11.4h}, [sp], 32
    ld1             {v12.4h - v15.4h}, [sp], 32
    ins             v24.2d[1], v25.2d[0]
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqrshrn         v20.8b,   v24.8h,   #2
 #else
    sqrshrn         v20.4h,   v24.4s,   #2
 #endif
      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    /* trn1            v16.8h,    v16.8h,  v18.8h */
    transpose       v16, v18, v3, .16b, .8h
    ins             v26.2d[1], v27.2d[0]
    ins             v28.2d[1], v29.2d[0]
    ins             v30.2d[1], v31.2d[0]
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqrshrn2        v20.16b,  v26.8h,   #2
    sqrshrn         v22.8b,   v28.8h,   #2
 #else
    sqrshrn2        v20.8h,   v26.4s,   #2
    sqrshrn         v22.4h,   v28.4s,   #2
 #endif
    movi            v0.16b,   #(CENTERJSAMPLE)
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqrshrn2        v22.16b,  v30.8h,   #2
 #else
    sqrshrn2        v22.8h,   v30.4s,   #2
 #endif
    transpose_single v16, v17, v3, .2d, .8b
    transpose_single v18, v19, v3, .2d, .8b
    add             v16.8b,   v16.8b,   v0.8b
@@ -628,6 +615,15 @@ asm_function jsimd_idct_islow_neon
    st1             {v21.8b}, [TMP2]
    st1             {v22.8b}, [TMP3]
    st1             {v23.8b}, [TMP4]
    ldr             x15, [sp], 16
    ld1             {v0.8b - v3.8b}, [sp], 32
    ld1             {v4.8b - v7.8b}, [sp], 32
    ld1             {v8.8b - v11.8b}, [sp], 32
    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v20.8b - v23.8b}, [sp], 32
    ld1             {v24.8b - v27.8b}, [sp], 32
    ld1             {v28.8b - v31.8b}, [sp], 32
    blr             x30
 3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
@@ -799,7 +795,8 @@ asm_function jsimd_idct_ifast_neon
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
-    TMP4            .req x15
+    TMP4            .req x22
    TMP5            .req x23
    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
@@ -814,7 +811,15 @@ asm_function jsimd_idct_ifast_neon
     *   6 | d28     | d29     ( v14.8h )
     *   7 | d30     | d31     ( v15.8h )
     */
-    adr             x15, jsimd_idct_ifast_neon_consts
+    /* Save NEON registers used in fast IDCT */
    sub             sp, sp, #176
    stp             x22, x23, [sp], 16
    adr             x23, jsimd_idct_ifast_neon_consts
    st1             {v0.8b - v3.8b}, [sp], 32
    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
@@ -830,14 +835,9 @@ asm_function jsimd_idct_ifast_neon
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v14.8h, v14.8h, v2.8h
    mul             v13.8h, v13.8h, v1.8h
-    ld1             {v0.4h}, [x15]      /* load constants */
+    ld1             {v0.4h}, [x23]      /* load constants */
    mul             v15.8h, v15.8h, v3.8h
    /* vpush           {v4.8h-v6.8h} */ /* save NEON registers */
    sub             sp, sp, #32
    st1             {v4.8h-v5.8h}, [sp] /* save NEON registers */
    sub             sp, sp, #16
    st1             {v6.8h}, [sp]
    /* 1-D IDCT, pass 1 */
    sub             v2.8h,    v10.8h,   v14.8h
    add             v14.8h,   v10.8h,   v14.8h
@@ -912,25 +912,25 @@ asm_function jsimd_idct_ifast_neon
    trn1            v13.4s,   v13.4s,   v15.4s
    trn2            v15.4s,   v18.4s,   v15.4s
    /* vswp            v14.4h,   v10-MSB.4h */
-    umov            x10, v14.d[0]
+    umov            x22, v14.d[0]
    ins             v14.2d[0], v10.2d[1]
-    ins             v10.2d[1], x10
+    ins             v10.2d[1], x22
    /* vswp            v13.4h,   v9MSB.4h */
-    umov            x10, v13.d[0]
+    umov            x22, v13.d[0]
    ins             v13.2d[0], v9.2d[1]
-    ins             v9.2d[1], x10
+    ins             v9.2d[1], x22
    /* 1-D IDCT, pass 2 */
    sub             v2.8h,    v10.8h,   v14.8h
    /* vswp            v15.4h,   v11MSB.4h */
-    umov            x10, v15.d[0]
+    umov            x22, v15.d[0]
    ins             v15.2d[0], v11.2d[1]
-    ins             v11.2d[1], x10
+    ins             v11.2d[1], x22
    add             v14.8h,   v10.8h,   v14.8h
    /* vswp            v12.4h,   v8-MSB.4h */
-    umov            x10, v12.d[0]
+    umov            x22, v12.d[0]
    ins             v12.2d[0], v8.2d[1]
-    ins             v8.2d[1], x10
+    ins             v8.2d[1], x22
    sub             v1.8h,    v11.8h,   v13.8h
    add             v13.8h,   v11.8h,   v13.8h
    sub             v5.8h,    v9.8h,    v15.8h
@@ -966,15 +966,11 @@ asm_function jsimd_idct_ifast_neon
    add             v14.8h,   v5.8h,    v3.8h
    sub             v9.8h,    v5.8h,    v3.8h
    sub             v13.8h,   v10.8h,   v2.8h
    /* vpop            {v4.8h-v7.4h} */  /* restore NEON registers...not available */
    ld1             {v6.8h}, [sp], 16
    ld1             {v4.8h-v5.8h}, [sp], 32
    add             v10.8h,   v10.8h,   v2.8h
    sub             v11.8h,   v12.8h,   v1.8h
    add             v12.8h,   v12.8h,   v1.8h
    /* Descale to 8-bit and range limit */
    movi            v0.16b,   #0x80
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqshrn          v8.8b,    v8.8h,    #5
    sqshrn2         v8.16b,   v9.8h,    #5
    sqshrn          v9.8b,    v10.8h,   #5
@@ -983,16 +979,6 @@ asm_function jsimd_idct_ifast_neon
    sqshrn2         v10.16b,  v13.8h,   #5
    sqshrn          v11.8b,   v14.8h,   #5
    sqshrn2         v11.16b,  v15.8h,   #5
 #else
    sqshrn          v8.4h,    v8.4s,    #5
    sqshrn2         v8.8h,    v9.4s,    #5
    sqshrn          v9.4h,    v10.4s,   #5
    sqshrn2         v9.8h,    v11.4s,   #5
    sqshrn          v10.4h,   v12.4s,   #5
    sqshrn2         v10.8h,   v13.4s,   #5
    sqshrn          v11.4h,   v14.4s,   #5
    sqshrn2         v11.8h,   v15.4s,   #5
 #endif
    add             v8.16b,   v8.16b,   v0.16b
    add             v9.16b,   v9.16b,   v0.16b
    add             v10.16b,  v10.16b,  v0.16b
@@ -1036,26 +1022,33 @@ asm_function jsimd_idct_ifast_neon
    add             TMP2,     TMP2,     OUTPUT_COL
    st1             {v9.8b},  [TMP1]
    /* make copy */
-    ins             v21.2d[0], v10.2d[1]
+    ins             v7.2d[0], v10.2d[1]
    mov             v18.16b,  v10.16b
-    trn1            v10.8b,   v10.8b,   v21.8b
+    trn1            v10.8b,   v10.8b,   v7.8b
-    trn2            v21.8b,   v18.8b,   v21.8b
+    trn2            v7.8b,    v18.8b,   v7.8b
    st1             {v19.8b}, [TMP2]
    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
+    ldp             TMP4,     TMP5,     [OUTPUT_BUF], 16
    add             TMP1,     TMP1,     OUTPUT_COL
    add             TMP2,     TMP2,     OUTPUT_COL
    add             TMP3,     TMP3,     OUTPUT_COL
    add             TMP4,     TMP4,     OUTPUT_COL
    add             TMP5,     TMP5,     OUTPUT_COL
    st1             {v10.8b}, [TMP1]
    /* make copy */
-    ins             v23.2d[0], v11.2d[1]
+    ins             v16.2d[0], v11.2d[1]
    mov             v18.16b,  v11.16b
-    trn1            v11.8b,   v11.8b,   v23.8b
+    trn1            v11.8b,   v11.8b,   v16.8b
-    trn2            v23.8b,   v18.8b,   v23.8b
+    trn2            v16.8b,   v18.8b,   v16.8b
-    st1             {v21.8b}, [TMP2]
+    st1             {v7.8b},  [TMP2]
-    st1             {v11.8b}, [TMP3]
+    st1             {v11.8b}, [TMP4]
-    st1             {v23.8b}, [TMP4]
+    st1             {v16.8b}, [TMP5]
    sub             sp, sp, #176
    ldp             x22, x23, [sp], 16
    ld1             {v0.8b - v3.8b}, [sp], 32
    ld1             {v4.8b - v7.8b}, [sp], 32
    ld1             {v8.8b - v11.8b}, [sp], 32
    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    blr             x30
    .unreq          DCT_TABLE
@@ -1179,14 +1172,19 @@ asm_function jsimd_idct_4x4_neon
    TMP3            .req x2
    TMP4            .req x15
-    /* vpush           {v8.4h-v15.4h} */
+    /* Save all used NEON registers */
-    sub             sp, sp, #32
+    sub             sp, sp, 272
-    st1             {v8.4h-v11.4h}, [sp] /* save NEON registers */
+    str             x15, [sp], 16
    sub             sp, sp, #32
    st1             {v12.4h-v15.4h}, [sp]
    /* Load constants (v3.4h is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    st1             {v0.8b - v3.8b}, [sp], 32
    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    st1             {v20.8b - v23.8b}, [sp], 32
    st1             {v24.8b - v27.8b}, [sp], 32
    st1             {v28.8b - v31.8b}, [sp], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
    /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1290,10 +1288,17 @@ asm_function jsimd_idct_4x4_neon
    st1             {v27.b}[7], [TMP4], 1
 #endif
-    /* vpop            {v8.4h-v15.4h}    ;not available */
+    /* vpop            {v8.4h - v15.4h}    ;not available */
-    ld1             {v12.4h-v15.4h}, [sp], 32
+    sub             sp, sp, #272
-    ld1             {v8.4h-v11.4h}, [sp], 32
+    ldr             x15, [sp], 16
-
+    ld1             {v0.8b - v3.8b}, [sp], 32
    ld1             {v4.8b - v7.8b}, [sp], 32
    ld1             {v8.8b - v11.8b}, [sp], 32
    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v20.8b - v23.8b}, [sp], 32
    ld1             {v24.8b - v27.8b}, [sp], 32
    ld1             {v28.8b - v31.8b}, [sp], 32
    blr             x30
    .unreq          DCT_TABLE
@@ -1333,23 +1338,23 @@ jsimd_idct_2x2_neon_consts:
    .short     FIX_3_624509785     /* d0[3] */
 /*
  * idct_helper: arithmetic step invoked from jsimd_idct_2x2_neon.
  *
  *   acc   = \x4 widened to 32-bit lanes and shifted left by 15
  *   prod  = \x6 * v0.4h[3] + \x10 * v0.4h[2] + \x12 * v0.4h[1] + \x16 * v0.4h[0]
  *   \y26  = (acc + prod), rounded, shifted right by \shift and narrowed
  *   \y27  = (acc - prod), rounded, shifted right by \shift and narrowed
  *
  * Overwrites v20, v26 and the register that holds acc.
  * NOTE(review): the constants are read from v0.4h here, while the 2x2
  * routine in this change loads its constants with "ld1 {v14.4h}, [TMP2]" --
  * confirm that v0 holds the expected constants wherever this macro is used.
  */
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll      v28.4s, \x4,    #15
+    sshll      v15.4s, \x4,    #15
    smull      v26.4s, \x6,    v0.4h[3]
    smlal      v26.4s, \x10,   v0.4h[2]
    smlal      v26.4s, \x12,   v0.4h[1]
    smlal      v26.4s, \x16,   v0.4h[0]
-    add        v20.4s, v28.4s, v26.4s
+    add        v20.4s, v15.4s, v26.4s
-    sub        v28.4s, v28.4s, v26.4s
+    sub        v15.4s, v15.4s, v26.4s
 /* Shifts above 16 are done as a rounding shift followed by a plain narrow.
  * NOTE(review): presumably because rshrn to a .4h destination only accepts
  * shift amounts up to 16 -- confirm against the ISA reference. */
 .if \shift > 16
    srshr      v20.4s, v20.4s, #\shift
-    srshr      v28.4s, v28.4s, #\shift
+    srshr      v15.4s, v15.4s, #\shift
    xtn        \y26,   v20.4s
-    xtn        \y27,   v28.4s
+    xtn        \y27,   v15.4s
 .else
    rshrn      \y26,   v20.4s, #\shift
-    rshrn      \y27,   v28.4s, #\shift
+    rshrn      \y27,   v15.4s, #\shift
 .endif
 .endm
@@ -1363,15 +1368,20 @@ asm_function jsimd_idct_2x2_neon
    TMP1            .req x0
    TMP2            .req x15
-    /* vpush           {v8.4h-v15.4h}            ; not available */
+    /* vpush           {v8.4h - v15.4h}            ; not available */
-    sub             sp, sp, #32
+    sub             sp, sp, 208
-    st1             {v8.4h-v11.4h}, [sp]  /* save NEON registers */
+    str             x15, [sp], 16
    sub             sp, sp, #32
    st1             {v12.4h-v15.4h}, [sp]
    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
-    ld1             {v0.4h}, [TMP2]
+    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    st1             {v21.8b - v22.8b}, [sp], 16
    st1             {v24.8b - v27.8b}, [sp], 32
    st1             {v30.8b - v31.8b}, [sp], 16
    ld1             {v14.4h}, [TMP2]
    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
@@ -1423,24 +1433,24 @@ asm_function jsimd_idct_2x2_neon
    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
 #else
-    smull           v26.4s, v6.4h,  v0.4h[3]
+    smull           v26.4s, v6.4h,  v14.4h[3]
-    smlal           v26.4s, v10.4h, v0.4h[2]
+    smlal           v26.4s, v10.4h, v14.4h[2]
-    smlal           v26.4s, v12.4h, v0.4h[1]
+    smlal           v26.4s, v12.4h, v14.4h[1]
-    smlal           v26.4s, v16.4h, v0.4h[0]
+    smlal           v26.4s, v16.4h, v14.4h[0]
-    smull           v24.4s, v7.4h,  v0.4h[3]
+    smull           v24.4s, v7.4h,  v14.4h[3]
-    smlal           v24.4s, v11.4h, v0.4h[2]
+    smlal           v24.4s, v11.4h, v14.4h[2]
-    smlal           v24.4s, v13.4h, v0.4h[1]
+    smlal           v24.4s, v13.4h, v14.4h[1]
-    smlal           v24.4s, v17.4h, v0.4h[0]
+    smlal           v24.4s, v17.4h, v14.4h[0]
-    sshll           v28.4s, v4.4h,  #15
+    sshll           v15.4s, v4.4h,  #15
    sshll           v30.4s, v5.4h,  #15
-    add             v20.4s, v28.4s, v26.4s
+    add             v20.4s, v15.4s, v26.4s
-    sub             v28.4s, v28.4s, v26.4s
+    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h,  v20.4s, #13
-    rshrn           v6.4h,  v28.4s, #13
+    rshrn           v6.4h,  v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
-    sub             v28.4s, v30.4s, v24.4s
+    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h,  v20.4s, #13
-    rshrn           v7.4h,  v28.4s, #13
+    rshrn           v7.4h,  v15.4s, #13
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
 #endif
@@ -1466,11 +1476,15 @@ asm_function jsimd_idct_2x2_neon
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1
-    /* vpop            {v8.4h-v15.4h} ;not available */
+    sub             sp, sp, #208
-
+    ldr             x15, [sp], 16
-    ld1             {v12.4h-v15.4h}, [sp], 32
+    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.4h-v11.4h}, [sp], 32
+    ld1             {v8.8b - v11.8b}, [sp], 32
-
+    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v21.8b - v22.8b}, [sp], 16
    ld1             {v24.8b - v27.8b}, [sp], 32
    ld1             {v30.8b - v31.8b}, [sp], 16
    blr             x30
    .unreq          DCT_TABLE
@@ -1572,13 +1586,11 @@ asm_function jsimd_idct_2x2_neon
        .error unsupported bpp
    .endif
 .endm
-#ifdef RTSM_SQSHRN_SIM_ISSUE
+
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
-#else
+
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize
 #endif
 /*
- * 2 stage pipelined YCbCr->RGB conversion
+ * 2-stage pipelined YCbCr->RGB conversion
 */
 .macro do_yuv_to_rgb_stage1
@@ -1604,16 +1616,10 @@ asm_function jsimd_idct_2x2_neon
    uaddw        v20.8h, v20.8h, v0.8b
    uaddw        v24.8h, v24.8h, v0.8b
    uaddw        v28.8h, v28.8h, v0.8b
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqxtun       v1\g_offs\defsize, v20.8h
    sqxtun       v1\r_offs\defsize, v24.8h
    sqxtun       v1\b_offs\defsize, v28.8h
 #else
    sqxtun       v1\g_offs\gsize, v20.4s
    sqxtun       v1\r_offs\rsize, v24.4s
    sqxtun       v1\b_offs\bsize, v28.4s
 #endif
 .endm
 .macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1628,25 +1634,13 @@ asm_function jsimd_idct_2x2_neon
    uaddw        v20.8h, v20.8h, v0.8b
    uaddw        v24.8h, v24.8h, v0.8b
    uaddw        v28.8h, v28.8h, v0.8b
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqxtun       v1\g_offs\defsize, v20.8h
 #else
    sqxtun       v1\g_offs\gsize, v20.4s
 #endif
    ld1          {v0.8b}, [Y], 8
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqxtun       v1\r_offs\defsize, v24.8h
 #else
    sqxtun       v1\r_offs\rsize, v24.4s
 #endif
    prfm         PLDL1KEEP, [U, #64]
    prfm         PLDL1KEEP, [V, #64]
    prfm         PLDL1KEEP, [Y, #64]
 #ifdef RTSM_SQSHRN_SIM_ISSUE
    sqxtun       v1\b_offs\defsize, v28.8h
 #else
    sqxtun       v1\b_offs\gsize, v28.4s
 #endif
    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    do_store     \bpp, 8
@@ -1693,29 +1687,33 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
    V               .req x10
    N               .req x15
    sub             sp, sp, 336
    str             x15, [sp], 16
    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    adr             x15, jsimd_ycc_\colorid\()_neon_consts
    /* Save NEON registers */
    st1             {v0.8b - v3.8b}, [sp], 32
    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    st1             {v20.8b - v23.8b}, [sp], 32
    st1             {v24.8b - v27.8b}, [sp], 32
    st1             {v28.8b - v31.8b}, [sp], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]
    /* Save ARM registers and handle input arguments */
    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
-    stp             x4, x5, [sp,-16]!
+    stp             x4, x5, [sp], 16
-    stp             x6, x7, [sp,-16]!
+    stp             x6, x7, [sp], 16
-    stp             x8, x9, [sp,-16]!
+    stp             x8, x9, [sp], 16
-    stp             x10, x30, [sp,-16]!
+    stp             x10, x30, [sp], 16
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, 8]
    ldr             INPUT_BUF2, [INPUT_BUF, 16]
    .unreq          INPUT_BUF
    /* Save NEON registers */
    /* vpush           {v8.4h-v15.4h} */
    sub             sp, sp, #32
    st1             {v8.4h-v11.4h}, [sp]
    sub             sp, sp, #32
    st1             {v12.4h-v15.4h}, [sp]
    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
    movi            v10.16b, #255
    movi            v12.16b, #255
@@ -1778,14 +1776,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
    bgt             0b
 9:
    /* Restore all registers and return */
-    /* vpop            {v8.4h-v15.4h} */
+    sub             sp, sp, #336
-    ld1             {v12.4h-v15.4h}, [sp], #32
+    ldr             x15, [sp], 16
-    ld1             {v8.4h-v11.4h}, [sp], #32
+    ld1             {v0.8b - v3.8b}, [sp], 32
    ld1             {v4.8b - v7.8b}, [sp], 32
    ld1             {v8.8b - v11.8b}, [sp], 32
    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v20.8b - v23.8b}, [sp], 32
    ld1             {v24.8b - v27.8b}, [sp], 32
    ld1             {v28.8b - v31.8b}, [sp], 32
    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
-    ldp             x10, x30, [sp], #16
+    ldp             x4, x5, [sp], 16
-    ldp             x8, x9, [sp], #16
+    ldp             x6, x7, [sp], 16
-    ldp             x6, x5, [sp], #16
+    ldp             x8, x9, [sp], 16
-    ldp             x4, x5, [sp], #16
+    ldp             x10, x30, [sp], 16
    br              x30
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
@@ -1807,10 +1812,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 .purgem do_yuv_to_rgb_stage2_store_load_stage1
 .endm
 /* RTSM simulator fix: integer saturation works on an 8b boundary, so add a
 * new parameter as a workaround for the simulator issue
 */
 #ifdef RTSM_SQSHRN_SIM_ISSUE
 /*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
@@ -1818,15 +1819,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .
 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
 #else
 /*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  */
 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h
 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h
 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h
 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h
 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h
 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h
 #endif
 .purgem do_load
 .purgem do_store