ARM64 NEON SIMD implementation of progressive Huffman encoding

This commit adds ARM64 NEON optimizations for the
encode_mcu_AC_first() and encode_mcu_AC_refine() functions used in
progressive Huffman encoding.

Compression speedups for the typical set of five libjpeg-turbo test
images (https://libjpeg-turbo.org/About/Performance):
Cortex-A53: 23.8-39.2% (avg. 32.2%)
Cortex-A72: 26.8-41.1% (avg. 33.5%)
Apple A7: 29.7-45.9% (avg. 39.6%)

Closes #229
This commit is contained in:
mayeut
2018-04-03 12:47:54 +02:00
committed by DRC
parent b8a7680e12
commit e821464f79
4 changed files with 675 additions and 1 deletions

View File

@@ -92,6 +92,11 @@ segfault or other user-visible errant behavior, and given that the lossless
transformer (unlike the decompressor) is not generally exposed to arbitrary
data exploits, this issue did not likely pose a security risk.
12. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
(ARMv8) platforms. This speeds up the compression of full-color progressive
JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
modern ARMv8 CPUs.
2.0.3
=====

View File

@@ -22,6 +22,7 @@
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "jconfigint.h"
#include <stdio.h>
#include <string.h>
@@ -773,6 +774,18 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
  init_simd();

  /* The NEON fast path assumes 8x8 DCT blocks, 16-bit coefficients, and a
   * 64-bit size_t (the zero-bits bitmap is handled as one 64-bit word). */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
@@ -781,11 +794,25 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  init_simd();

  /* The NEON fast path assumes 8x8 DCT blocks, 16-bit coefficients, and a
   * 64-bit size_t (the bits[] words are handled as 64-bit values). */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
@@ -794,5 +821,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
  /* Defect fixed: a stray "return 0;" preceded the call below, making the
   * NEON dispatch unreachable.  Prerequisites were already verified by
   * jsimd_can_encode_mcu_AC_refine_prepare(), so dispatch directly; the
   * NEON routine returns the computed EOB position. */
  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
                                                 jpeg_natural_order_start,
                                                 Sl, Al, absvalues, bits);
}

View File

@@ -3425,3 +3425,635 @@ generate_jsimd_huff_encode_one_block 0
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47
/*****************************************************************************/
/*
* Macros to load data for jsimd_encode_mcu_AC_first_prepare_neon() and
* jsimd_encode_mcu_AC_refine_prepare_neon()
*/
/*
 * LOAD16: gather 16 coefficients from BLOCK using the natural-order index
 * table at LUT (one 32-bit index per entry).  Lanes are filled in pairs:
 * Y0.8h receives block[order[0..7]] and Y1.8h receives block[order[8..15]].
 * Advances LUT by 16 entries; BLOCK is unmodified.  Clobbers T0/T1.
 */
.macro LOAD16
/* Each step: load the index, scale it by 2 (JCOEF is 16-bit), and insert
   the addressed coefficient into one vector lane. */
ldr T0d, [LUT, #(0*4)]
ldr T1d, [LUT, #(8*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[0], [T0]
ld1 {Y1.h}[0], [T1]
ldr T0d, [LUT, #(1*4)]
ldr T1d, [LUT, #(9*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[1], [T0]
ld1 {Y1.h}[1], [T1]
ldr T0d, [LUT, #(2*4)]
ldr T1d, [LUT, #(10*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[2], [T0]
ld1 {Y1.h}[2], [T1]
ldr T0d, [LUT, #(3*4)]
ldr T1d, [LUT, #(11*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[3], [T0]
ld1 {Y1.h}[3], [T1]
ldr T0d, [LUT, #(4*4)]
ldr T1d, [LUT, #(12*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[4], [T0]
ld1 {Y1.h}[4], [T1]
ldr T0d, [LUT, #(5*4)]
ldr T1d, [LUT, #(13*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[5], [T0]
ld1 {Y1.h}[5], [T1]
ldr T0d, [LUT, #(6*4)]
ldr T1d, [LUT, #(14*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[6], [T0]
ld1 {Y1.h}[6], [T1]
ldr T0d, [LUT, #(7*4)]
ldr T1d, [LUT, #(15*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[7], [T0]
ld1 {Y1.h}[7], [T1]
add LUT, LUT, #(16*4) /* advance to the next 16 order entries */
.endm
/*
 * LOAD15: gather 8 + LENEND coefficients (1 <= LENEND <= 7).  Y0.8h is
 * filled completely from order[0..7]; Y1.8h gets lanes 0..LENEND-1 from
 * order[8..8+LENEND-1], with the remaining Y1 lanes left zero.
 * Clobbers T0/T1; LUT and BLOCK are unmodified.
 */
.macro LOAD15
/* Zero Y1 so that lanes beyond the tail length stay 0. */
eor Y1.16b, Y1.16b, Y1.16b
ldr T0d, [LUT, #(0*4)]
ldr T1d, [LUT, #(8*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[0], [T0]
ld1 {Y1.h}[0], [T1]
ldr T0d, [LUT, #(1*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[1], [T0]
ldr T0d, [LUT, #(2*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[2], [T0]
ldr T0d, [LUT, #(3*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[3], [T0]
ldr T0d, [LUT, #(4*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[4], [T0]
ldr T0d, [LUT, #(5*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[5], [T0]
ldr T0d, [LUT, #(6*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[6], [T0]
ldr T0d, [LUT, #(7*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[7], [T0]
/* Y1 lanes 1-6 are loaded only while LENEND permits; bail out early
   via the local label 1515 as soon as the tail is exhausted. */
cmp LENEND, #2
b.lt 1515f
ldr T1d, [LUT, #(9*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[1], [T1]
cmp LENEND, #3
b.lt 1515f
ldr T1d, [LUT, #(10*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[2], [T1]
cmp LENEND, #4
b.lt 1515f
ldr T1d, [LUT, #(11*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[3], [T1]
cmp LENEND, #5
b.lt 1515f
ldr T1d, [LUT, #(12*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[4], [T1]
cmp LENEND, #6
b.lt 1515f
ldr T1d, [LUT, #(13*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[5], [T1]
cmp LENEND, #7
b.lt 1515f
ldr T1d, [LUT, #(14*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[6], [T1]
1515:
.endm
/*
 * LOAD8: gather exactly 8 coefficients, block[order[0..7]], into Y0.8h.
 * Clobbers T0; LUT and BLOCK are unmodified.
 */
.macro LOAD8
ldr T0d, [LUT, #(0*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[0], [T0]
ldr T0d, [LUT, #(1*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[1], [T0]
ldr T0d, [LUT, #(2*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[2], [T0]
ldr T0d, [LUT, #(3*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[3], [T0]
ldr T0d, [LUT, #(4*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[4], [T0]
ldr T0d, [LUT, #(5*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[5], [T0]
ldr T0d, [LUT, #(6*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[6], [T0]
ldr T0d, [LUT, #(7*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[7], [T0]
.endm
/*
 * LOAD7: gather the final LENEND coefficients (1 <= LENEND <= 7) into
 * lanes 0..LENEND-1 of Y0.8h; the remaining lanes are zeroed first.
 * Clobbers T0/T1; LUT and BLOCK are unmodified.
 */
.macro LOAD7
eor Y0.16b, Y0.16b, Y0.16b
ldr T0d, [LUT, #(0*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[0], [T0]
/* Lanes 1-6 are loaded only while LENEND permits; exit early via the
   local label 77 once the tail is exhausted. */
cmp LENEND, #2
b.lt 77f
ldr T1d, [LUT, #(1*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[1], [T1]
cmp LENEND, #3
b.lt 77f
ldr T1d, [LUT, #(2*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[2], [T1]
cmp LENEND, #4
b.lt 77f
ldr T1d, [LUT, #(3*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[3], [T1]
cmp LENEND, #5
b.lt 77f
ldr T1d, [LUT, #(4*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[4], [T1]
cmp LENEND, #6
b.lt 77f
ldr T1d, [LUT, #(5*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[5], [T1]
cmp LENEND, #7
b.lt 77f
ldr T1d, [LUT, #(6*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[6], [T1]
77:
.endm
/*
 * REDUCE0: build the 64-bit zero-coefficient bitmap for the 64 values at
 * [VALUES] and store it at [BITS].  After the mvn, bit k of the result is
 * set iff values[k] != 0.  Reads 128 bytes, advancing VALUES by 128.
 * Clobbers v0-v7 and T0.
 */
.macro REDUCE0
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [VALUES], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [VALUES], #64
/* Per-element mask: 0xFFFF where the value is zero. */
cmeq v0.8h, v0.8h, #0
cmeq v1.8h, v1.8h, #0
cmeq v2.8h, v2.8h, #0
cmeq v3.8h, v3.8h, #0
cmeq v4.8h, v4.8h, #0
cmeq v5.8h, v5.8h, #0
cmeq v6.8h, v6.8h, #0
cmeq v7.8h, v7.8h, #0
/* Narrow the eight 16-bit masks into four vectors of byte masks. */
xtn v0.8b, v0.8h
xtn v2.8b, v2.8h
xtn v4.8b, v4.8h
xtn v6.8b, v6.8h
xtn2 v0.16b, v1.8h
xtn2 v2.16b, v3.8h
xtn2 v4.16b, v5.8h
xtn2 v6.16b, v7.8h
/* Keep one distinct bit (0x01..0x80, repeating) per byte lane so the
   pairwise adds below assemble a movemask-style bitmap. */
and v0.16b, v0.16b, ANDMASK.16b
and v2.16b, v2.16b, ANDMASK.16b
and v4.16b, v4.16b, ANDMASK.16b
and v6.16b, v6.16b, ANDMASK.16b
/* Three levels of pairwise adds collapse 64 mask bytes into 8 bytes. */
addp v0.16b, v0.16b, v2.16b
addp v4.16b, v4.16b, v6.16b
addp v0.16b, v0.16b, v4.16b
addp v0.16b, v0.16b, v0.16b
umov T0, v0.D[0]
mvn T0, T0 /* invert: set bit = nonzero coefficient */
str T0, [BITS]
.endm
/*
* Prepare data for jsimd_encode_mcu_AC_first().
*
* GLOBAL(int)
* jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF *block,
* const int *jpeg_natural_order_start,
* int Sl, int Al, JCOEF *values,
* size_t *zerobits)
*
* x0 = const JCOEF *block
* x1 = const int *jpeg_natural_order_start
* w2 = int Sl
* w3 = int Al
* x4 = JCOEF *values
* x5 = size_t *zerobits
*
*/
/* Register aliases: x0-x5 carry the C arguments (see prototype above);
   the rest are scratch.  Only caller-saved registers are used, so no
   stack frame is needed. */
ZERO .req v0
Y0 .req v2
Y1 .req v3
N0 .req v4
N1 .req v5
AL .req v6
ANDMASK .req v20
K .req w12
LUT .req x1
T0 .req x10
T0d .req w10
T1 .req x11
T1d .req w11
BLOCK .req x0
VALUES .req x4
XORVALUES .req x14
LEN .req w2
LENEND .req w9
BITS .req x5
.balign 16
/* Per-lane bit weights used by REDUCE0 to emulate _mm_movemask_epi8(). */
Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
asm_function jsimd_encode_mcu_AC_first_prepare_neon
adr T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
neg w3, w3 /* Al = -Al */
eor ZERO.16b, ZERO.16b, ZERO.16b
ld1 {ANDMASK.16b}, [T0]
dup AL.8h, w3 /* broadcast -Al: ushl by a negative count = right shift */
add XORVALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2) /* second half of values[] */
and LENEND, LEN, 7 /* LENEND = Sl % 8 */
lsr K, LEN, 4 /* K = number of full 16-coefficient chunks */
cbz K, 3f
/* Main loop, 16 coefficients per iteration:
   values[k]    = abs(coef) >> Al
   values[k+64] = (abs(coef) >> Al) ^ (coef < 0 ? all-ones : 0) */
1:
LOAD16
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
eor N1.16b, N1.16b, Y1.16b
st1 {Y0.8h, Y1.8h}, [VALUES], #32
st1 {N0.8h, N1.8h}, [XORVALUES], #32
subs K, K, #1
b.ne 1b
/* Tail dispatch: 8+LENEND remaining -> LOAD15, exactly 8 -> LOAD8 (2:),
   fewer than 8 -> LOAD7 (3: below). */
3:
tst LEN, #8
b.eq 3f
tst LEN, #7
b.eq 2f
LOAD15
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
eor N1.16b, N1.16b, Y1.16b
st1 {Y0.8h, Y1.8h}, [VALUES], #32
st1 {N0.8h, N1.8h}, [XORVALUES], #32
b 4f
2:
LOAD8
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
st1 {Y0.8h}, [VALUES], #16
st1 {N0.8h}, [XORVALUES], #16
b 4f
3:
cbz LENEND, 4f
LOAD7
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
st1 {Y0.8h}, [VALUES], #16
st1 {N0.8h}, [XORVALUES], #16
/* b 4f */
/* fallthrough */
/* Zero-pad both halves out to DCTSIZE2 (64) elements; K counts the
   8-element groups already written, so the loop runs 8 - ceil(Sl/8)
   times (skipped entirely when Sl covers the whole block). */
4:
add K, LEN, #7
lsr K, K, #3
subs K, K, #(/*DCTSIZE2*/ 64 / 8)
b.eq 5f
1:
st1 {ZERO.8h}, [VALUES], #16
st1 {ZERO.8h}, [XORVALUES], #16
adds K, K, #1
b.ne 1b
5:
/* Rewind VALUES to the start of values[] and emit the zerobits bitmap. */
sub VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
REDUCE0
br x30
.unreq ZERO
.unreq Y0
.unreq Y1
.unreq N0
.unreq N1
.unreq AL
.unreq ANDMASK
.unreq K
.unreq LUT
.unreq T0
.unreq T0d
.unreq T1
.unreq T1d
.unreq BLOCK
.unreq VALUES
.unreq XORVALUES
.unreq LEN
.unreq LENEND
.unreq BITS
/*
* Prepare data for jsimd_encode_mcu_AC_refine.
*
* GLOBAL(int)
* jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF *block,
* const int *jpeg_natural_order_start,
* int Sl, int Al, JCOEF *absvalues,
* size_t *bits)
*
* x0 = const JCOEF *block
* x1 = const int *jpeg_natural_order_start
* w2 = int Sl
* w3 = int Al
* x4 = JCOEF *absvalues
* x5 = size_t *bits
*
*/
/* Register aliases: x0-x5 carry the C arguments (see prototype above).
   SIGN accumulates the per-coefficient sign bits (newest in the top
   byte/halfword, shifted down as groups complete); EOB tracks the last
   index whose scaled absolute value equals 1. */
ZERO .req v0
ONE .req v1
Y0 .req v2
Y1 .req v3
N0 .req v4
N1 .req v5
AL .req v6
ANDMASK .req v20
K .req w12
KK .req w13
EOB .req w14
SIGN .req x15
LUT .req x1
T0 .req x10
T0d .req w10
T1 .req x11
T1d .req w11
BLOCK .req x0
VALUES .req x4
LEN .req w2
LENEND .req w9
BITS .req x5
.balign 16
/* Per-lane bit weights used to emulate _mm_movemask_epi8(). */
Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
asm_function jsimd_encode_mcu_AC_refine_prepare_neon
adr T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
neg w3, w3 /* Al = -Al */
movi ONE.8h, #1
eor SIGN, SIGN, SIGN
eor ZERO.16b, ZERO.16b, ZERO.16b
eor EOB, EOB, EOB
ld1 {ANDMASK.16b}, [T0]
eor KK, KK, KK /* KK = base index k of the current group */
dup AL.8h, w3 /* broadcast -Al: ushl by a negative count = right shift */
and LENEND, LEN, 7 /* LENEND = Sl % 8 */
lsr K, LEN, 4 /* K = number of full 16-coefficient chunks */
cbz K, 3f
/* Main loop, 16 coefficients per iteration: store absvalues[k] =
   abs(coef) >> Al, fold the 16 sign bits into SIGN, and update EOB with
   the highest index whose scaled value == 1. */
1:
LOAD16
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
st1 {Y0.8h, Y1.8h}, [VALUES], #32
xtn N0.8b, N0.8h
xtn N1.8b, N1.8h
cmeq Y0.8h, Y0.8h, ONE.8h
cmeq Y1.8h, Y1.8h, ONE.8h
xtn Y0.8b, Y0.8h
xtn Y1.8b, Y1.8h
/* Weight each mask byte, then addv sums them into a movemask-style
   byte; two bytes are fused into a 16-bit mask via ins. */
and N0.8b, N0.8b, ANDMASK.8b
and N1.8b, N1.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
and Y1.8b, Y1.8b, ANDMASK.8b
addv B28, N0.8b
addv B29, N1.8b
addv B30, Y0.8b
addv B31, Y1.8b
ins v28.b[1], v29.b[0]
ins v30.b[1], v31.b[0]
umov T0d, v28.h[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.h[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #16 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #48
cbz T1d, 2f
/* rbit+clz finds the highest set bit = last lane with value 1. */
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
2:
add KK, KK, #16
subs K, K, #1
b.ne 1b
/* Tail dispatch: 8+LENEND remaining -> LOAD15, exactly 8 -> LOAD8 (2:),
   fewer than 8 -> LOAD7 (3: below). */
3:
tst LEN, #8
b.eq 3f
tst LEN, #7
b.eq 2f
LOAD15
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
st1 {Y0.8h, Y1.8h}, [VALUES], #32
xtn N0.8b, N0.8h
xtn N1.8b, N1.8h
cmeq Y0.8h, Y0.8h, ONE.8h
cmeq Y1.8h, Y1.8h, ONE.8h
xtn Y0.8b, Y0.8h
xtn Y1.8b, Y1.8h
and N0.8b, N0.8b, ANDMASK.8b
and N1.8b, N1.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
and Y1.8b, Y1.8b, ANDMASK.8b
addv B28, N0.8b
addv B29, N1.8b
addv B30, Y0.8b
addv B31, Y1.8b
ins v28.b[1], v29.b[0]
ins v30.b[1], v31.b[0]
umov T0d, v28.h[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.h[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #16 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #48
cbz T1d, 4f
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
b 4f
2:
LOAD8
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
st1 {Y0.8h}, [VALUES], #16
xtn N0.8b, N0.8h
cmeq Y0.8h, Y0.8h, ONE.8h
xtn Y0.8b, Y0.8h
and N0.8b, N0.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
addv B28, N0.8b
addv B30, Y0.8b
umov T0d, v28.b[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.b[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #8 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #56
cbz T1d, 4f
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
b 4f
3:
cbz LENEND, 4f
LOAD7
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
st1 {Y0.8h}, [VALUES], #16
xtn N0.8b, N0.8h
cmeq Y0.8h, Y0.8h, ONE.8h
xtn Y0.8b, Y0.8h
and N0.8b, N0.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
addv B28, N0.8b
addv B30, Y0.8b
umov T0d, v28.b[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.b[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #8 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #56
cbz T1d, 4f
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
/* b 4f */
/* fallthrough */
/* Zero-pad absvalues[] out to DCTSIZE2 (64) elements, shifting SIGN down
   one byte per 8-element group so the sign bits end up aligned. */
4:
add K, LEN, #7
lsr K, K, #3
subs K, K, #(/*DCTSIZE2*/ 64 / 8)
b.eq 5f
1:
st1 {ZERO.8h}, [VALUES], #16
lsr SIGN, SIGN, #8
adds K, K, #1
b.ne 1b
5:
/* Store the inverted sign word in bits[1], the zero bitmap in bits[0]
   (via REDUCE0), and return EOB. */
mvn SIGN, SIGN
sub VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
str SIGN, [BITS, #8]
REDUCE0
mov w0, EOB
br x30
.unreq ZERO
.unreq ONE
.unreq Y0
.unreq Y1
.unreq N0
.unreq N1
.unreq AL
.unreq ANDMASK
.unreq K
.unreq KK
.unreq EOB
.unreq SIGN
.unreq LUT
.unreq T0
.unreq T0d
.unreq T1
.unreq T1d
.unreq BLOCK
.unreq VALUES
.unreq LEN
.unreq LENEND
.unreq BITS
/* Discard the shared load/reduce macros now that both functions have
   been assembled. */
.purgem LOAD16
.purgem LOAD15
.purgem LOAD8
.purgem LOAD7
.purgem REDUCE0

View File

@@ -1153,6 +1153,14 @@ EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);