ARM64 NEON SIMD implementation of progressive Huffman encoding

This commit adds ARM64 NEON optimizations for the
encode_mcu_AC_first() and encode_mcu_AC_refine() functions used in
progressive Huffman encoding.

Compression speedups for the typical set of five libjpeg-turbo test
images (https://libjpeg-turbo.org/About/Performance):
Cortex-A53: 23.8-39.2% (avg. 32.2%)
Cortex-A72: 26.8-41.1% (avg. 33.5%)
Apple A7: 29.7-45.9% (avg. 39.6%)

Closes #229
This commit is contained in:
mayeut
2018-04-03 12:47:54 +02:00
committed by DRC
parent b8a7680e12
commit e821464f79
4 changed files with 675 additions and 1 deletions

View File

@@ -92,6 +92,11 @@ segfault or other user-visible errant behavior, and given that the lossless
transformer (unlike the decompressor) is not generally exposed to arbitrary
data exploits, this issue did not likely pose a security risk.
12. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
(ARMv8) platforms. This speeds up the compression of full-color progressive
JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
modern ARMv8 CPUs.
2.0.3
=====

View File

@@ -22,6 +22,7 @@
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "jconfigint.h"
#include <stdio.h>
#include <string.h>
@@ -773,6 +774,18 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
  init_simd();

  /* The NEON fast path assumes 8x8 DCT blocks, 16-bit coefficients, and a
   * 64-bit size_t (the zero-bits bitmap is handled as one 64-bit word). */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
@@ -781,11 +794,25 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  init_simd();

  /* The NEON fast path assumes 8x8 DCT blocks, 16-bit coefficients, and a
   * 64-bit size_t (the bits[] words are handled as 64-bit values). */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
@@ -794,5 +821,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
  /* Defect fixed: a stray "return 0;" preceded the call below, making the
   * NEON dispatch unreachable.  Prerequisites were already verified by
   * jsimd_can_encode_mcu_AC_refine_prepare(), so dispatch directly; the
   * NEON routine returns the computed EOB position. */
  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
                                                 jpeg_natural_order_start,
                                                 Sl, Al, absvalues, bits);
}

View File

@@ -3425,3 +3425,635 @@ generate_jsimd_huff_encode_one_block 0
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47
/*****************************************************************************/
/*
* Macros to load data for jsimd_encode_mcu_AC_first_prepare_neon() and
* jsimd_encode_mcu_AC_refine_prepare_neon()
*/
/*
 * LOAD16: gather 16 coefficients from BLOCK using the natural-order index
 * table at LUT (one 32-bit index per entry).  Lanes are filled in pairs:
 * Y0.8h receives block[order[0..7]] and Y1.8h receives block[order[8..15]].
 * Advances LUT by 16 entries; BLOCK is unmodified.  Clobbers T0/T1.
 */
.macro LOAD16
/* Each step: load the index, scale it by 2 (JCOEF is 16-bit), and insert
   the addressed coefficient into one vector lane. */
ldr T0d, [LUT, #(0*4)]
ldr T1d, [LUT, #(8*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[0], [T0]
ld1 {Y1.h}[0], [T1]
ldr T0d, [LUT, #(1*4)]
ldr T1d, [LUT, #(9*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[1], [T0]
ld1 {Y1.h}[1], [T1]
ldr T0d, [LUT, #(2*4)]
ldr T1d, [LUT, #(10*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[2], [T0]
ld1 {Y1.h}[2], [T1]
ldr T0d, [LUT, #(3*4)]
ldr T1d, [LUT, #(11*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[3], [T0]
ld1 {Y1.h}[3], [T1]
ldr T0d, [LUT, #(4*4)]
ldr T1d, [LUT, #(12*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[4], [T0]
ld1 {Y1.h}[4], [T1]
ldr T0d, [LUT, #(5*4)]
ldr T1d, [LUT, #(13*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[5], [T0]
ld1 {Y1.h}[5], [T1]
ldr T0d, [LUT, #(6*4)]
ldr T1d, [LUT, #(14*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[6], [T0]
ld1 {Y1.h}[6], [T1]
ldr T0d, [LUT, #(7*4)]
ldr T1d, [LUT, #(15*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[7], [T0]
ld1 {Y1.h}[7], [T1]
add LUT, LUT, #(16*4) /* advance to the next 16 order entries */
.endm
/*
 * LOAD15: gather 8 + LENEND coefficients (1 <= LENEND <= 7).  Y0.8h is
 * filled completely from order[0..7]; Y1.8h gets lanes 0..LENEND-1 from
 * order[8..8+LENEND-1], with the remaining Y1 lanes left zero.
 * Clobbers T0/T1; LUT and BLOCK are unmodified.
 */
.macro LOAD15
/* Zero Y1 so that lanes beyond the tail length stay 0. */
eor Y1.16b, Y1.16b, Y1.16b
ldr T0d, [LUT, #(0*4)]
ldr T1d, [LUT, #(8*4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[0], [T0]
ld1 {Y1.h}[0], [T1]
ldr T0d, [LUT, #(1*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[1], [T0]
ldr T0d, [LUT, #(2*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[2], [T0]
ldr T0d, [LUT, #(3*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[3], [T0]
ldr T0d, [LUT, #(4*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[4], [T0]
ldr T0d, [LUT, #(5*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[5], [T0]
ldr T0d, [LUT, #(6*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[6], [T0]
ldr T0d, [LUT, #(7*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[7], [T0]
/* Y1 lanes 1-6 are loaded only while LENEND permits; bail out early
   via the local label 1515 as soon as the tail is exhausted. */
cmp LENEND, #2
b.lt 1515f
ldr T1d, [LUT, #(9*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[1], [T1]
cmp LENEND, #3
b.lt 1515f
ldr T1d, [LUT, #(10*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[2], [T1]
cmp LENEND, #4
b.lt 1515f
ldr T1d, [LUT, #(11*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[3], [T1]
cmp LENEND, #5
b.lt 1515f
ldr T1d, [LUT, #(12*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[4], [T1]
cmp LENEND, #6
b.lt 1515f
ldr T1d, [LUT, #(13*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[5], [T1]
cmp LENEND, #7
b.lt 1515f
ldr T1d, [LUT, #(14*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[6], [T1]
1515:
.endm
/*
 * LOAD8: gather exactly 8 coefficients, block[order[0..7]], into Y0.8h.
 * Clobbers T0; LUT and BLOCK are unmodified.
 */
.macro LOAD8
ldr T0d, [LUT, #(0*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[0], [T0]
ldr T0d, [LUT, #(1*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[1], [T0]
ldr T0d, [LUT, #(2*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[2], [T0]
ldr T0d, [LUT, #(3*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[3], [T0]
ldr T0d, [LUT, #(4*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[4], [T0]
ldr T0d, [LUT, #(5*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[5], [T0]
ldr T0d, [LUT, #(6*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[6], [T0]
ldr T0d, [LUT, #(7*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[7], [T0]
.endm
/*
 * LOAD7: gather the final LENEND coefficients (1 <= LENEND <= 7) into
 * lanes 0..LENEND-1 of Y0.8h; the remaining lanes are zeroed first.
 * Clobbers T0/T1; LUT and BLOCK are unmodified.
 */
.macro LOAD7
eor Y0.16b, Y0.16b, Y0.16b
ldr T0d, [LUT, #(0*4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[0], [T0]
/* Lanes 1-6 are loaded only while LENEND permits; exit early via the
   local label 77 once the tail is exhausted. */
cmp LENEND, #2
b.lt 77f
ldr T1d, [LUT, #(1*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[1], [T1]
cmp LENEND, #3
b.lt 77f
ldr T1d, [LUT, #(2*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[2], [T1]
cmp LENEND, #4
b.lt 77f
ldr T1d, [LUT, #(3*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[3], [T1]
cmp LENEND, #5
b.lt 77f
ldr T1d, [LUT, #(4*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[4], [T1]
cmp LENEND, #6
b.lt 77f
ldr T1d, [LUT, #(5*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[5], [T1]
cmp LENEND, #7
b.lt 77f
ldr T1d, [LUT, #(6*4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[6], [T1]
77:
.endm
/*
 * REDUCE0: build the 64-bit zero-coefficient bitmap for the 64 values at
 * [VALUES] and store it at [BITS].  After the mvn, bit k of the result is
 * set iff values[k] != 0.  Reads 128 bytes, advancing VALUES by 128.
 * Clobbers v0-v7 and T0.
 */
.macro REDUCE0
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [VALUES], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [VALUES], #64
/* Per-element mask: 0xFFFF where the value is zero. */
cmeq v0.8h, v0.8h, #0
cmeq v1.8h, v1.8h, #0
cmeq v2.8h, v2.8h, #0
cmeq v3.8h, v3.8h, #0
cmeq v4.8h, v4.8h, #0
cmeq v5.8h, v5.8h, #0
cmeq v6.8h, v6.8h, #0
cmeq v7.8h, v7.8h, #0
/* Narrow the eight 16-bit masks into four vectors of byte masks. */
xtn v0.8b, v0.8h
xtn v2.8b, v2.8h
xtn v4.8b, v4.8h
xtn v6.8b, v6.8h
xtn2 v0.16b, v1.8h
xtn2 v2.16b, v3.8h
xtn2 v4.16b, v5.8h
xtn2 v6.16b, v7.8h
/* Keep one distinct bit (0x01..0x80, repeating) per byte lane so the
   pairwise adds below assemble a movemask-style bitmap. */
and v0.16b, v0.16b, ANDMASK.16b
and v2.16b, v2.16b, ANDMASK.16b
and v4.16b, v4.16b, ANDMASK.16b
and v6.16b, v6.16b, ANDMASK.16b
/* Three levels of pairwise adds collapse 64 mask bytes into 8 bytes. */
addp v0.16b, v0.16b, v2.16b
addp v4.16b, v4.16b, v6.16b
addp v0.16b, v0.16b, v4.16b
addp v0.16b, v0.16b, v0.16b
umov T0, v0.D[0]
mvn T0, T0 /* invert: set bit = nonzero coefficient */
str T0, [BITS]
.endm
/*
* Prepare data for jsimd_encode_mcu_AC_first().
*
* GLOBAL(int)
* jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF *block,
* const int *jpeg_natural_order_start,
* int Sl, int Al, JCOEF *values,
* size_t *zerobits)
*
* x0 = const JCOEF *block
* x1 = const int *jpeg_natural_order_start
* w2 = int Sl
* w3 = int Al
* x4 = JCOEF *values
* x5 = size_t *zerobits
*
*/
/* Register aliases: x0-x5 carry the C arguments (see prototype above);
   the rest are scratch.  Only caller-saved registers are used, so no
   stack frame is needed. */
ZERO .req v0
Y0 .req v2
Y1 .req v3
N0 .req v4
N1 .req v5
AL .req v6
ANDMASK .req v20
K .req w12
LUT .req x1
T0 .req x10
T0d .req w10
T1 .req x11
T1d .req w11
BLOCK .req x0
VALUES .req x4
XORVALUES .req x14
LEN .req w2
LENEND .req w9
BITS .req x5
.balign 16
/* Per-lane bit weights used by REDUCE0 to emulate _mm_movemask_epi8(). */
Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
asm_function jsimd_encode_mcu_AC_first_prepare_neon
adr T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
neg w3, w3 /* Al = -Al */
eor ZERO.16b, ZERO.16b, ZERO.16b
ld1 {ANDMASK.16b}, [T0]
dup AL.8h, w3 /* broadcast -Al: ushl by a negative count = right shift */
add XORVALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2) /* second half of values[] */
and LENEND, LEN, 7 /* LENEND = Sl % 8 */
lsr K, LEN, 4 /* K = number of full 16-coefficient chunks */
cbz K, 3f
/* Main loop, 16 coefficients per iteration:
   values[k]    = abs(coef) >> Al
   values[k+64] = (abs(coef) >> Al) ^ (coef < 0 ? all-ones : 0) */
1:
LOAD16
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
eor N1.16b, N1.16b, Y1.16b
st1 {Y0.8h, Y1.8h}, [VALUES], #32
st1 {N0.8h, N1.8h}, [XORVALUES], #32
subs K, K, #1
b.ne 1b
/* Tail dispatch: 8+LENEND remaining -> LOAD15, exactly 8 -> LOAD8 (2:),
   fewer than 8 -> LOAD7 (3: below). */
3:
tst LEN, #8
b.eq 3f
tst LEN, #7
b.eq 2f
LOAD15
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
eor N1.16b, N1.16b, Y1.16b
st1 {Y0.8h, Y1.8h}, [VALUES], #32
st1 {N0.8h, N1.8h}, [XORVALUES], #32
b 4f
2:
LOAD8
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
st1 {Y0.8h}, [VALUES], #16
st1 {N0.8h}, [XORVALUES], #16
b 4f
3:
cbz LENEND, 4f
LOAD7
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
eor N0.16b, N0.16b, Y0.16b
st1 {Y0.8h}, [VALUES], #16
st1 {N0.8h}, [XORVALUES], #16
/* b 4f */
/* fallthrough */
/* Zero-pad both halves out to DCTSIZE2 (64) elements; K counts the
   8-element groups already written, so the loop runs 8 - ceil(Sl/8)
   times (skipped entirely when Sl covers the whole block). */
4:
add K, LEN, #7
lsr K, K, #3
subs K, K, #(/*DCTSIZE2*/ 64 / 8)
b.eq 5f
1:
st1 {ZERO.8h}, [VALUES], #16
st1 {ZERO.8h}, [XORVALUES], #16
adds K, K, #1
b.ne 1b
5:
/* Rewind VALUES to the start of values[] and emit the zerobits bitmap. */
sub VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
REDUCE0
br x30
.unreq ZERO
.unreq Y0
.unreq Y1
.unreq N0
.unreq N1
.unreq AL
.unreq ANDMASK
.unreq K
.unreq LUT
.unreq T0
.unreq T0d
.unreq T1
.unreq T1d
.unreq BLOCK
.unreq VALUES
.unreq XORVALUES
.unreq LEN
.unreq LENEND
.unreq BITS
/*
* Prepare data for jsimd_encode_mcu_AC_refine.
*
* GLOBAL(int)
* jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF *block,
* const int *jpeg_natural_order_start,
* int Sl, int Al, JCOEF *absvalues,
* size_t *bits)
*
* x0 = const JCOEF *block
* x1 = const int *jpeg_natural_order_start
* w2 = int Sl
* w3 = int Al
* x4 = JCOEF *absvalues
* x5 = size_t *bits
*
*/
/* Register aliases: x0-x5 carry the C arguments (see prototype above).
   SIGN accumulates the per-coefficient sign bits (newest in the top
   byte/halfword, shifted down as groups complete); EOB tracks the last
   index whose scaled absolute value equals 1. */
ZERO .req v0
ONE .req v1
Y0 .req v2
Y1 .req v3
N0 .req v4
N1 .req v5
AL .req v6
ANDMASK .req v20
K .req w12
KK .req w13
EOB .req w14
SIGN .req x15
LUT .req x1
T0 .req x10
T0d .req w10
T1 .req x11
T1d .req w11
BLOCK .req x0
VALUES .req x4
LEN .req w2
LENEND .req w9
BITS .req x5
.balign 16
/* Per-lane bit weights used to emulate _mm_movemask_epi8(). */
Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
asm_function jsimd_encode_mcu_AC_refine_prepare_neon
adr T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
neg w3, w3 /* Al = -Al */
movi ONE.8h, #1
eor SIGN, SIGN, SIGN
eor ZERO.16b, ZERO.16b, ZERO.16b
eor EOB, EOB, EOB
ld1 {ANDMASK.16b}, [T0]
eor KK, KK, KK /* KK = base index k of the current group */
dup AL.8h, w3 /* broadcast -Al: ushl by a negative count = right shift */
and LENEND, LEN, 7 /* LENEND = Sl % 8 */
lsr K, LEN, 4 /* K = number of full 16-coefficient chunks */
cbz K, 3f
/* Main loop, 16 coefficients per iteration: store absvalues[k] =
   abs(coef) >> Al, fold the 16 sign bits into SIGN, and update EOB with
   the highest index whose scaled value == 1. */
1:
LOAD16
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
st1 {Y0.8h, Y1.8h}, [VALUES], #32
xtn N0.8b, N0.8h
xtn N1.8b, N1.8h
cmeq Y0.8h, Y0.8h, ONE.8h
cmeq Y1.8h, Y1.8h, ONE.8h
xtn Y0.8b, Y0.8h
xtn Y1.8b, Y1.8h
/* Weight each mask byte, then addv sums them into a movemask-style
   byte; two bytes are fused into a 16-bit mask via ins. */
and N0.8b, N0.8b, ANDMASK.8b
and N1.8b, N1.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
and Y1.8b, Y1.8b, ANDMASK.8b
addv B28, N0.8b
addv B29, N1.8b
addv B30, Y0.8b
addv B31, Y1.8b
ins v28.b[1], v29.b[0]
ins v30.b[1], v31.b[0]
umov T0d, v28.h[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.h[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #16 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #48
cbz T1d, 2f
/* rbit+clz finds the highest set bit = last lane with value 1. */
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
2:
add KK, KK, #16
subs K, K, #1
b.ne 1b
/* Tail dispatch: 8+LENEND remaining -> LOAD15, exactly 8 -> LOAD8 (2:),
   fewer than 8 -> LOAD7 (3: below). */
3:
tst LEN, #8
b.eq 3f
tst LEN, #7
b.eq 2f
LOAD15
cmlt N0.8h, Y0.8h, #0
cmlt N1.8h, Y1.8h, #0
abs Y0.8h, Y0.8h
abs Y1.8h, Y1.8h
ushl Y0.8h, Y0.8h, AL.8h
ushl Y1.8h, Y1.8h, AL.8h
st1 {Y0.8h, Y1.8h}, [VALUES], #32
xtn N0.8b, N0.8h
xtn N1.8b, N1.8h
cmeq Y0.8h, Y0.8h, ONE.8h
cmeq Y1.8h, Y1.8h, ONE.8h
xtn Y0.8b, Y0.8h
xtn Y1.8b, Y1.8h
and N0.8b, N0.8b, ANDMASK.8b
and N1.8b, N1.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
and Y1.8b, Y1.8b, ANDMASK.8b
addv B28, N0.8b
addv B29, N1.8b
addv B30, Y0.8b
addv B31, Y1.8b
ins v28.b[1], v29.b[0]
ins v30.b[1], v31.b[0]
umov T0d, v28.h[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.h[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #16 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #48
cbz T1d, 4f
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
b 4f
2:
LOAD8
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
st1 {Y0.8h}, [VALUES], #16
xtn N0.8b, N0.8h
cmeq Y0.8h, Y0.8h, ONE.8h
xtn Y0.8b, Y0.8h
and N0.8b, N0.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
addv B28, N0.8b
addv B30, Y0.8b
umov T0d, v28.b[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.b[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #8 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #56
cbz T1d, 4f
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
b 4f
3:
cbz LENEND, 4f
LOAD7
cmlt N0.8h, Y0.8h, #0
abs Y0.8h, Y0.8h
ushl Y0.8h, Y0.8h, AL.8h
st1 {Y0.8h}, [VALUES], #16
xtn N0.8b, N0.8h
cmeq Y0.8h, Y0.8h, ONE.8h
xtn Y0.8b, Y0.8h
and N0.8b, N0.8b, ANDMASK.8b
and Y0.8b, Y0.8b, ANDMASK.8b
addv B28, N0.8b
addv B30, Y0.8b
umov T0d, v28.b[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
umov T1d, v30.b[0] /* idx = _mm_movemask_epi8(x1); */
lsr SIGN, SIGN, #8 /* make room for sizebits */
orr SIGN, SIGN, T0, lsl #56
cbz T1d, 4f
rbit T1d, T1d
clz T1d, T1d
add EOB, KK, T1d /* EOB = k + idx; */
/* b 4f */
/* fallthrough */
/* Zero-pad absvalues[] out to DCTSIZE2 (64) elements, shifting SIGN down
   one byte per 8-element group so the sign bits end up aligned. */
4:
add K, LEN, #7
lsr K, K, #3
subs K, K, #(/*DCTSIZE2*/ 64 / 8)
b.eq 5f
1:
st1 {ZERO.8h}, [VALUES], #16
lsr SIGN, SIGN, #8
adds K, K, #1
b.ne 1b
5:
/* Store the inverted sign word in bits[1], the zero bitmap in bits[0]
   (via REDUCE0), and return EOB. */
mvn SIGN, SIGN
sub VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
str SIGN, [BITS, #8]
REDUCE0
mov w0, EOB
br x30
.unreq ZERO
.unreq ONE
.unreq Y0
.unreq Y1
.unreq N0
.unreq N1
.unreq AL
.unreq ANDMASK
.unreq K
.unreq KK
.unreq EOB
.unreq SIGN
.unreq LUT
.unreq T0
.unreq T0d
.unreq T1
.unreq T1d
.unreq BLOCK
.unreq VALUES
.unreq LEN
.unreq LENEND
.unreq BITS
/* Discard the shared load/reduce macros now that both functions have
   been assembled. */
.purgem LOAD16
.purgem LOAD15
.purgem LOAD8
.purgem LOAD7
.purgem REDUCE0

View File

@@ -1153,6 +1153,14 @@ EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);