ARM64 NEON SIMD implementation of progressive Huffman encoding
This commit adds ARM64 NEON optimizations for the encode_mcu_AC_first() and encode_mcu_AC_refine() functions used in progressive Huffman encoding. Compression speedups for the typical set of five libjpeg-turbo test images (https://libjpeg-turbo.org/About/Performance): Cortex-A53: 23.8-39.2% (avg. 32.2%) Cortex-A72: 26.8-41.1% (avg. 33.5%) Apple A7: 29.7-45.9% (avg. 39.6%) Closes #229
This commit is contained in:
@@ -92,6 +92,11 @@ segfault or other user-visible errant behavior, and given that the lossless
|
||||
transformer (unlike the decompressor) is not generally exposed to arbitrary
|
||||
data exploits, this issue did not likely pose a security risk.
|
||||
|
||||
12. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
|
||||
(ARMv8) platforms. This speeds up the compression of full-color progressive
|
||||
JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
|
||||
modern ARMv8 CPUs.
|
||||
|
||||
|
||||
2.0.3
|
||||
=====
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "jconfigint.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -773,6 +774,18 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (SIZEOF_SIZE_T != 8)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -781,11 +794,25 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
|
||||
Sl, Al, values, zerobits);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (SIZEOF_SIZE_T != 8)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -794,5 +821,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return 0;
|
||||
return jsimd_encode_mcu_AC_refine_prepare_neon(block,
|
||||
jpeg_natural_order_start,
|
||||
Sl, Al, absvalues, bits);
|
||||
}
|
||||
|
||||
@@ -3425,3 +3425,635 @@ generate_jsimd_huff_encode_one_block 0
|
||||
.purgem put_bits
|
||||
.purgem checkbuf31
|
||||
.purgem checkbuf47
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* Macros to load data for jsimd_encode_mcu_AC_first_prepare_neon() and
|
||||
* jsimd_encode_mcu_AC_refine_prepare_neon()
|
||||
*/
|
||||
|
||||
/* LOAD16: gather 16 coefficients from the block in natural (zigzag-mapped)
 * order.  For each k in 0..7 it reads the 32-bit index LUT[k] and LUT[k+8],
 * scales by sizeof(JCOEF) == 2 (lsl #1), and loads block[LUT[k]] into lane k
 * of Y0 and block[LUT[k+8]] into lane k of Y1.  Advances LUT by 16 entries.
 * Clobbers: T0/T0d, T1/T1d, Y0, Y1. */
.macro LOAD16
    ldr             T0d, [LUT, #(0*4)]      /* T0d = natural_order[0] */
    ldr             T1d, [LUT, #(8*4)]      /* T1d = natural_order[8] */
    add             T0, BLOCK, T0, lsl #1   /* T0 = &block[T0d] (2-byte JCOEF) */
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[0], [T0]         /* Y0[0] = block[natural_order[0]] */
    ld1             {Y1.h}[0], [T1]         /* Y1[0] = block[natural_order[8]] */

    /* Remaining seven lane pairs follow the identical gather pattern. */
    ldr             T0d, [LUT, #(1*4)]
    ldr             T1d, [LUT, #(9*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[1], [T0]
    ld1             {Y1.h}[1], [T1]

    ldr             T0d, [LUT, #(2*4)]
    ldr             T1d, [LUT, #(10*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[2], [T0]
    ld1             {Y1.h}[2], [T1]

    ldr             T0d, [LUT, #(3*4)]
    ldr             T1d, [LUT, #(11*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[3], [T0]
    ld1             {Y1.h}[3], [T1]

    ldr             T0d, [LUT, #(4*4)]
    ldr             T1d, [LUT, #(12*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[4], [T0]
    ld1             {Y1.h}[4], [T1]

    ldr             T0d, [LUT, #(5*4)]
    ldr             T1d, [LUT, #(13*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[5], [T0]
    ld1             {Y1.h}[5], [T1]

    ldr             T0d, [LUT, #(6*4)]
    ldr             T1d, [LUT, #(14*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[6], [T0]
    ld1             {Y1.h}[6], [T1]

    ldr             T0d, [LUT, #(7*4)]
    ldr             T1d, [LUT, #(15*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[7], [T0]
    ld1             {Y1.h}[7], [T1]

    add             LUT, LUT, #(16*4)       /* consumed 16 LUT entries */
.endm
|
||||
|
||||
/* LOAD15: tail gather for an 8 < remaining-length < 16 case.  Loads the first
 * eight coefficients unconditionally into Y0, zeroes Y1, then loads
 * LENEND - 1 further coefficients (LENEND in 1..7; lane 0 of Y1 is always
 * loaded, lanes 1..6 are guarded by LENEND comparisons).  Unloaded Y1 lanes
 * stay zero.  Does NOT advance LUT.  Clobbers: T0/T0d, T1/T1d, Y0, Y1. */
.macro LOAD15
    eor             Y1.16b, Y1.16b, Y1.16b  /* Y1 = 0 (pad missing lanes) */

    ldr             T0d, [LUT, #(0*4)]
    ldr             T1d, [LUT, #(8*4)]
    add             T0, BLOCK, T0, lsl #1
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[0], [T0]
    ld1             {Y1.h}[0], [T1]

    ldr             T0d, [LUT, #(1*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[1], [T0]

    ldr             T0d, [LUT, #(2*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[2], [T0]

    ldr             T0d, [LUT, #(3*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[3], [T0]

    ldr             T0d, [LUT, #(4*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[4], [T0]

    ldr             T0d, [LUT, #(5*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[5], [T0]

    ldr             T0d, [LUT, #(6*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[6], [T0]

    ldr             T0d, [LUT, #(7*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[7], [T0]

    /* Conditionally load Y1 lanes 1..6; bail out as soon as LENEND is
     * exhausted (local numeric label 1515 marks the exit). */
    cmp             LENEND, #2
    b.lt            1515f
    ldr             T1d, [LUT, #(9*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y1.h}[1], [T1]

    cmp             LENEND, #3
    b.lt            1515f
    ldr             T1d, [LUT, #(10*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y1.h}[2], [T1]

    cmp             LENEND, #4
    b.lt            1515f
    ldr             T1d, [LUT, #(11*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y1.h}[3], [T1]

    cmp             LENEND, #5
    b.lt            1515f
    ldr             T1d, [LUT, #(12*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y1.h}[4], [T1]

    cmp             LENEND, #6
    b.lt            1515f
    ldr             T1d, [LUT, #(13*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y1.h}[5], [T1]

    cmp             LENEND, #7
    b.lt            1515f
    ldr             T1d, [LUT, #(14*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y1.h}[6], [T1]

1515:
.endm
|
||||
|
||||
/* LOAD8: gather exactly eight coefficients, block[LUT[0..7]], into lanes
 * 0..7 of Y0.  Same index-scale-load pattern as LOAD16 but single-register.
 * Does NOT advance LUT.  Clobbers: T0/T0d, Y0. */
.macro LOAD8
    ldr             T0d, [LUT, #(0*4)]
    add             T0, BLOCK, T0, lsl #1   /* &block[LUT[0]], 2-byte JCOEF */
    ld1             {Y0.h}[0], [T0]

    ldr             T0d, [LUT, #(1*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[1], [T0]

    ldr             T0d, [LUT, #(2*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[2], [T0]

    ldr             T0d, [LUT, #(3*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[3], [T0]

    ldr             T0d, [LUT, #(4*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[4], [T0]

    ldr             T0d, [LUT, #(5*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[5], [T0]

    ldr             T0d, [LUT, #(6*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[6], [T0]

    ldr             T0d, [LUT, #(7*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[7], [T0]
.endm
|
||||
|
||||
/* LOAD7: tail gather for 1..7 remaining coefficients.  Zeroes Y0, loads lane
 * 0 unconditionally, then lanes 1..6 guarded by LENEND (lane k is loaded only
 * when LENEND > k); local numeric label 77 is the early exit.  Unloaded lanes
 * stay zero.  Does NOT advance LUT.  Clobbers: T0/T0d, T1/T1d, Y0. */
.macro LOAD7
    eor             Y0.16b, Y0.16b, Y0.16b  /* Y0 = 0 (pad missing lanes) */

    ldr             T0d, [LUT, #(0*4)]
    add             T0, BLOCK, T0, lsl #1
    ld1             {Y0.h}[0], [T0]

    cmp             LENEND, #2
    b.lt            77f
    ldr             T1d, [LUT, #(1*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[1], [T1]

    cmp             LENEND, #3
    b.lt            77f
    ldr             T1d, [LUT, #(2*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[2], [T1]

    cmp             LENEND, #4
    b.lt            77f
    ldr             T1d, [LUT, #(3*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[3], [T1]

    cmp             LENEND, #5
    b.lt            77f
    ldr             T1d, [LUT, #(4*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[4], [T1]

    cmp             LENEND, #6
    b.lt            77f
    ldr             T1d, [LUT, #(5*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[5], [T1]

    cmp             LENEND, #7
    b.lt            77f
    ldr             T1d, [LUT, #(6*4)]
    add             T1, BLOCK, T1, lsl #1
    ld1             {Y0.h}[6], [T1]

77:
.endm
|
||||
|
||||
/* REDUCE0: build the 64-bit "zerobits" bitmap from the 64 16-bit values at
 * [VALUES] and store its complement to [BITS].  Bit k of the stored word is 1
 * iff values[k] != 0.  Steps: compare each value against zero (all-ones lane
 * on equality), narrow 16->8 bits, mask each byte down to a single positional
 * bit via ANDMASK (0x01,0x02,...,0x80 repeated), then three pairwise-add
 * passes fold 64 bytes into one 64-bit word.  Advances VALUES by 128 bytes.
 * Clobbers: v0-v7, T0, flags untouched. */
.macro REDUCE0
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [VALUES], #64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [VALUES], #64

    /* Lane = 0xFFFF where value == 0, else 0. */
    cmeq            v0.8h, v0.8h, #0
    cmeq            v1.8h, v1.8h, #0
    cmeq            v2.8h, v2.8h, #0
    cmeq            v3.8h, v3.8h, #0
    cmeq            v4.8h, v4.8h, #0
    cmeq            v5.8h, v5.8h, #0
    cmeq            v6.8h, v6.8h, #0
    cmeq            v7.8h, v7.8h, #0

    /* Narrow eight 8h vectors into four 16b vectors (one byte per value). */
    xtn             v0.8b, v0.8h
    xtn             v2.8b, v2.8h
    xtn             v4.8b, v4.8h
    xtn             v6.8b, v6.8h
    xtn2            v0.16b, v1.8h
    xtn2            v2.16b, v3.8h
    xtn2            v4.16b, v5.8h
    xtn2            v6.16b, v7.8h

    /* Keep one positional bit per byte, then fold pairwise: within each
     * 8-byte group the kept bits are disjoint, so addp acts as a bitwise OR
     * that packs 8 bytes into 1. */
    and             v0.16b, v0.16b, ANDMASK.16b
    and             v2.16b, v2.16b, ANDMASK.16b
    and             v4.16b, v4.16b, ANDMASK.16b
    and             v6.16b, v6.16b, ANDMASK.16b
    addp            v0.16b, v0.16b, v2.16b
    addp            v4.16b, v4.16b, v6.16b
    addp            v0.16b, v0.16b, v4.16b
    addp            v0.16b, v0.16b, v0.16b
    umov            T0, v0.D[0]             /* T0: bit k set iff values[k]==0 */
    mvn             T0, T0                  /* invert: bit k set iff nonzero */
    str             T0, [BITS]
.endm
|
||||
|
||||
/*
 * Prepare data for jsimd_encode_mcu_AC_first().
 *
 * GLOBAL(int)
 * jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF *block,
 *                                        const int *jpeg_natural_order_start,
 *                                        int Sl, int Al, JCOEF *values,
 *                                        size_t *zerobits)
 *
 * x0 = const JCOEF *block
 * x1 = const int *jpeg_natural_order_start
 * w2 = int Sl
 * w3 = int Al
 * x4 = JCOEF *values
 * x5 = size_t *zerobits
 *
 * Writes abs(coef) >> Al for the first Sl natural-order coefficients to
 * values[0..63] (zero-padded past Sl), the matching XOR'ed sign patterns to
 * values[64..127], and a 64-bit nonzero bitmap to *zerobits.
 */

/* Register aliases (AAPCS64: x0-x17/v0-v7/v16-v31 are caller-saved, so no
 * callee-saved registers need spilling here). */
ZERO            .req v0
Y0              .req v2
Y1              .req v3
N0              .req v4
N1              .req v5
AL              .req v6
ANDMASK         .req v20
K               .req w12            /* loop counter */
LUT             .req x1             /* natural-order index table cursor */
T0              .req x10
T0d             .req w10
T1              .req x11
T1d             .req w11
BLOCK           .req x0
VALUES          .req x4
XORVALUES       .req x14            /* = VALUES + DCTSIZE2*2 (sign area) */
LEN             .req w2             /* Sl */
LENEND          .req w9             /* Sl & 7 */
BITS            .req x5

.balign 16
Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80

asm_function jsimd_encode_mcu_AC_first_prepare_neon
    adr             T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
    neg             w3, w3                  /* Al = -Al */
    eor             ZERO.16b, ZERO.16b, ZERO.16b
    ld1             {ANDMASK.16b}, [T0]     /* per-byte positional bit masks */
    dup             AL.8h, w3               /* ushl by -Al == logical >> Al */
    add             XORVALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
    and             LENEND, LEN, 7          /* trailing 1..7 coefficients */
    lsr             K, LEN, 4               /* number of full 16-coef chunks */
    cbz             K, 3f
1:  /* Main loop: process 16 coefficients per iteration. */
    LOAD16
    cmlt            N0.8h, Y0.8h, #0        /* lane = all-ones if coef < 0 */
    cmlt            N1.8h, Y1.8h, #0
    abs             Y0.8h, Y0.8h
    abs             Y1.8h, Y1.8h
    ushl            Y0.8h, Y0.8h, AL.8h     /* abs(coef) >> Al */
    ushl            Y1.8h, Y1.8h, AL.8h
    eor             N0.16b, N0.16b, Y0.16b  /* sign-conditional complement */
    eor             N1.16b, N1.16b, Y1.16b
    st1             {Y0.8h, Y1.8h}, [VALUES], #32
    st1             {N0.8h, N1.8h}, [XORVALUES], #32
    subs            K, K, #1
    b.ne            1b
3:  /* Tail dispatch on the low 4 bits of Sl. */
    tst             LEN, #8
    b.eq            3f                      /* < 8 left: LOAD7 path */
    tst             LEN, #7
    b.eq            2f                      /* exactly 8 left: LOAD8 path */

    /* 9..15 coefficients left. */
    LOAD15
    cmlt            N0.8h, Y0.8h, #0
    cmlt            N1.8h, Y1.8h, #0
    abs             Y0.8h, Y0.8h
    abs             Y1.8h, Y1.8h
    ushl            Y0.8h, Y0.8h, AL.8h
    ushl            Y1.8h, Y1.8h, AL.8h
    eor             N0.16b, N0.16b, Y0.16b
    eor             N1.16b, N1.16b, Y1.16b
    st1             {Y0.8h, Y1.8h}, [VALUES], #32
    st1             {N0.8h, N1.8h}, [XORVALUES], #32
    b               4f
2:  /* Exactly 8 coefficients left. */
    LOAD8
    cmlt            N0.8h, Y0.8h, #0
    abs             Y0.8h, Y0.8h
    ushl            Y0.8h, Y0.8h, AL.8h
    eor             N0.16b, N0.16b, Y0.16b
    st1             {Y0.8h}, [VALUES], #16
    st1             {N0.8h}, [XORVALUES], #16
    b               4f
3:  /* 0..7 coefficients left. */
    cbz             LENEND, 4f
    LOAD7
    cmlt            N0.8h, Y0.8h, #0
    abs             Y0.8h, Y0.8h
    ushl            Y0.8h, Y0.8h, AL.8h
    eor             N0.16b, N0.16b, Y0.16b
    st1             {Y0.8h}, [VALUES], #16
    st1             {N0.8h}, [XORVALUES], #16
    /* b 4f */
    /* fallthrough */
4:  /* Zero-pad values[] and the XOR area out to 64 entries each. */
    add             K, LEN, #7
    lsr             K, K, #3                /* K = ceil(Sl / 8) groups done */
    subs            K, K, #(/*DCTSIZE2*/ 64 / 8)
    b.eq            5f
1:
    st1             {ZERO.8h}, [VALUES], #16
    st1             {ZERO.8h}, [XORVALUES], #16
    adds            K, K, #1
    b.ne            1b
5:
    sub             VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)  /* rewind */

    REDUCE0                                 /* emit nonzero bitmap to *BITS */

    br              x30

.unreq ZERO
.unreq Y0
.unreq Y1
.unreq N0
.unreq N1
.unreq AL
.unreq ANDMASK
.unreq K
.unreq LUT
.unreq T0
.unreq T0d
.unreq T1
.unreq T1d
.unreq BLOCK
.unreq VALUES
.unreq XORVALUES
.unreq LEN
.unreq LENEND
.unreq BITS
|
||||
|
||||
/*
 * Prepare data for jsimd_encode_mcu_AC_refine.
 *
 * GLOBAL(int)
 * jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF *block,
 *                                         const int *jpeg_natural_order_start,
 *                                         int Sl, int Al, JCOEF *absvalues,
 *                                         size_t *bits)
 *
 * x0 = const JCOEF *block
 * x1 = const int *jpeg_natural_order_start
 * w2 = int Sl
 * w3 = int Al
 * x4 = JCOEF *absvalues
 * x5 = int Al
 *
 * Writes abs(coef) >> Al to absvalues[0..63] (zero-padded past Sl), the
 * nonzero bitmap to bits[0], the complemented sign bitmap to bits[1], and
 * returns (in w0) the index of the last coefficient whose shifted absolute
 * value is 1 (the EOB position), or 0 if none.
 */

/* Register aliases; all caller-saved under AAPCS64, so nothing is spilled. */
ZERO            .req v0
ONE             .req v1
Y0              .req v2
Y1              .req v3
N0              .req v4
N1              .req v5
AL              .req v6
ANDMASK         .req v20
K               .req w12            /* loop counter */
KK              .req w13            /* running coefficient index k */
EOB             .req w14            /* last index where abs value == 1 */
SIGN            .req x15            /* sign bitmap accumulator (shifts down) */
LUT             .req x1
T0              .req x10
T0d             .req w10
T1              .req x11
T1d             .req w11
BLOCK           .req x0
VALUES          .req x4
LEN             .req w2             /* Sl */
LENEND          .req w9             /* Sl & 7 */
BITS            .req x5

.balign 16
Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80

asm_function jsimd_encode_mcu_AC_refine_prepare_neon
    adr             T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
    neg             w3, w3                  /* Al = -Al */
    movi            ONE.8h, #1
    eor             SIGN, SIGN, SIGN
    eor             ZERO.16b, ZERO.16b, ZERO.16b
    eor             EOB, EOB, EOB
    ld1             {ANDMASK.16b}, [T0]
    eor             KK, KK, KK
    dup             AL.8h, w3               /* ushl by -Al == logical >> Al */
    and             LENEND, LEN, 7
    lsr             K, LEN, 4               /* number of full 16-coef chunks */
    cbz             K, 3f
1:  /* Main loop: 16 coefficients per iteration. */
    LOAD16
    cmlt            N0.8h, Y0.8h, #0        /* sign lanes */
    cmlt            N1.8h, Y1.8h, #0
    abs             Y0.8h, Y0.8h
    abs             Y1.8h, Y1.8h
    ushl            Y0.8h, Y0.8h, AL.8h     /* abs(coef) >> Al */
    ushl            Y1.8h, Y1.8h, AL.8h
    st1             {Y0.8h, Y1.8h}, [VALUES], #32
    xtn             N0.8b, N0.8h
    xtn             N1.8b, N1.8h
    cmeq            Y0.8h, Y0.8h, ONE.8h    /* candidate EOB positions */
    cmeq            Y1.8h, Y1.8h, ONE.8h
    xtn             Y0.8b, Y0.8h
    xtn             Y1.8b, Y1.8h
    /* Emulate SSE2 _mm_movemask_epi8: mask each byte to its positional bit,
     * then horizontal-add (disjoint bits, so addv acts as OR). */
    and             N0.8b, N0.8b, ANDMASK.8b
    and             N1.8b, N1.8b, ANDMASK.8b
    and             Y0.8b, Y0.8b, ANDMASK.8b
    and             Y1.8b, Y1.8b, ANDMASK.8b
    addv            B28, N0.8b
    addv            B29, N1.8b
    addv            B30, Y0.8b
    addv            B31, Y1.8b
    ins             v28.b[1], v29.b[0]
    ins             v30.b[1], v31.b[0]
    umov            T0d, v28.h[0]  /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
    umov            T1d, v30.h[0]  /* idx = _mm_movemask_epi8(x1); */
    lsr             SIGN, SIGN, #16  /* make room for sizebits */
    orr             SIGN, SIGN, T0, lsl #48
    cbz             T1d, 2f
    rbit            T1d, T1d                /* find highest set bit ... */
    clz             T1d, T1d                /* ... via reverse + clz */
    add             EOB, KK, T1d  /* EOB = k + idx; */
2:
    add             KK, KK, #16
    subs            K, K, #1
    b.ne            1b
3:  /* Tail dispatch on the low 4 bits of Sl. */
    tst             LEN, #8
    b.eq            3f                      /* < 8 left: LOAD7 path */
    tst             LEN, #7
    b.eq            2f                      /* exactly 8 left: LOAD8 path */

    /* 9..15 coefficients left. */
    LOAD15
    cmlt            N0.8h, Y0.8h, #0
    cmlt            N1.8h, Y1.8h, #0
    abs             Y0.8h, Y0.8h
    abs             Y1.8h, Y1.8h
    ushl            Y0.8h, Y0.8h, AL.8h
    ushl            Y1.8h, Y1.8h, AL.8h
    st1             {Y0.8h, Y1.8h}, [VALUES], #32
    xtn             N0.8b, N0.8h
    xtn             N1.8b, N1.8h
    cmeq            Y0.8h, Y0.8h, ONE.8h
    cmeq            Y1.8h, Y1.8h, ONE.8h
    xtn             Y0.8b, Y0.8h
    xtn             Y1.8b, Y1.8h
    and             N0.8b, N0.8b, ANDMASK.8b
    and             N1.8b, N1.8b, ANDMASK.8b
    and             Y0.8b, Y0.8b, ANDMASK.8b
    and             Y1.8b, Y1.8b, ANDMASK.8b
    addv            B28, N0.8b
    addv            B29, N1.8b
    addv            B30, Y0.8b
    addv            B31, Y1.8b
    ins             v28.b[1], v29.b[0]
    ins             v30.b[1], v31.b[0]
    umov            T0d, v28.h[0]  /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
    umov            T1d, v30.h[0]  /* idx = _mm_movemask_epi8(x1); */
    lsr             SIGN, SIGN, #16  /* make room for sizebits */
    orr             SIGN, SIGN, T0, lsl #48
    cbz             T1d, 4f
    rbit            T1d, T1d
    clz             T1d, T1d
    add             EOB, KK, T1d  /* EOB = k + idx; */
    b               4f
2:  /* Exactly 8 coefficients left. */
    LOAD8
    cmlt            N0.8h, Y0.8h, #0
    abs             Y0.8h, Y0.8h
    ushl            Y0.8h, Y0.8h, AL.8h
    st1             {Y0.8h}, [VALUES], #16
    xtn             N0.8b, N0.8h
    cmeq            Y0.8h, Y0.8h, ONE.8h
    xtn             Y0.8b, Y0.8h
    and             N0.8b, N0.8b, ANDMASK.8b
    and             Y0.8b, Y0.8b, ANDMASK.8b
    addv            B28, N0.8b
    addv            B30, Y0.8b
    umov            T0d, v28.b[0]  /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
    umov            T1d, v30.b[0]  /* idx = _mm_movemask_epi8(x1); */
    lsr             SIGN, SIGN, #8  /* make room for sizebits */
    orr             SIGN, SIGN, T0, lsl #56
    cbz             T1d, 4f
    rbit            T1d, T1d
    clz             T1d, T1d
    add             EOB, KK, T1d  /* EOB = k + idx; */
    b               4f
3:  /* 0..7 coefficients left. */
    cbz             LENEND, 4f
    LOAD7
    cmlt            N0.8h, Y0.8h, #0
    abs             Y0.8h, Y0.8h
    ushl            Y0.8h, Y0.8h, AL.8h
    st1             {Y0.8h}, [VALUES], #16
    xtn             N0.8b, N0.8h
    cmeq            Y0.8h, Y0.8h, ONE.8h
    xtn             Y0.8b, Y0.8h
    and             N0.8b, N0.8b, ANDMASK.8b
    and             Y0.8b, Y0.8b, ANDMASK.8b
    addv            B28, N0.8b
    addv            B30, Y0.8b
    umov            T0d, v28.b[0]  /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
    umov            T1d, v30.b[0]  /* idx = _mm_movemask_epi8(x1); */
    lsr             SIGN, SIGN, #8  /* make room for sizebits */
    orr             SIGN, SIGN, T0, lsl #56
    cbz             T1d, 4f
    rbit            T1d, T1d
    clz             T1d, T1d
    add             EOB, KK, T1d  /* EOB = k + idx; */
    /* b 4f */
    /* fallthrough */
4:  /* Zero-pad absvalues[] to 64 entries; keep shifting SIGN down so the
     * sign bits land in the correct final positions. */
    add             K, LEN, #7
    lsr             K, K, #3
    subs            K, K, #(/*DCTSIZE2*/ 64 / 8)
    b.eq            5f
1:
    st1             {ZERO.8h}, [VALUES], #16
    lsr             SIGN, SIGN, #8
    adds            K, K, #1
    b.ne            1b
5:
    mvn             SIGN, SIGN              /* complement sign bitmap */
    sub             VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)  /* rewind */
    str             SIGN, [BITS, #8]        /* bits[1] = ~signbits */

    REDUCE0                                 /* bits[0] = nonzero bitmap */

    mov             w0, EOB                 /* return EOB position */
    br              x30

.unreq ZERO
.unreq ONE
.unreq Y0
.unreq Y1
.unreq N0
.unreq N1
.unreq AL
.unreq ANDMASK
.unreq K
.unreq KK
.unreq EOB
.unreq SIGN
.unreq LUT
.unreq T0
.unreq T0d
.unreq T1
.unreq T1d
.unreq BLOCK
.unreq VALUES
.unreq LEN
.unreq LENEND
.unreq BITS

.purgem LOAD16
.purgem LOAD15
.purgem LOAD8
.purgem LOAD7
.purgem REDUCE0
|
||||
|
||||
@@ -1153,6 +1153,14 @@ EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *values, size_t *zerobits);
|
||||
|
||||
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *values, size_t *zerobits);
|
||||
|
||||
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *absvalues, size_t *bits);
|
||||
|
||||
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *absvalues, size_t *bits);
|
||||
|
||||
Reference in New Issue
Block a user