C/SSE2 optimization of encode_mcu_AC_first()

This commit adds C and SSE2 optimizations for the encode_mcu_AC_first()
function used in progressive Huffman encoding.

The image used for testing can be retrieved from this page:
https://blog.cloudflare.com/doubling-the-speed-of-jpegtran

All timings done on `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz`
clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)`
gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0`
gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0`

Here are the results in comparison to libjpeg-turbo@293263c using
`time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg`

C
clang x86_64: +19%
gcc-5 x86_64: +80%
gcc-7 x86_64: +57%
clang i386: +5%
gcc-5 i386: +59%
gcc-7 i386: +51%

SSE2
clang x86_64: +79%
gcc-5 x86_64: +158%
gcc-7 x86_64: +122%
clang i386: +71%
gcc-5 i386: +134%
gcc-7 i386: +135%

Discussion in libjpeg-turbo/libjpeg-turbo#46
This commit is contained in:
mayeut
2018-03-22 11:36:43 -05:00
committed by DRC
parent 16bd984557
commit 5b177b3cab
14 changed files with 641 additions and 56 deletions

View File

@@ -122,6 +122,11 @@ approximately 2-3.5x.
14. Fixed a build error when building with older MinGW releases (regression
caused by 1.5.1[7].)
15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
x86 and x86-64 platforms. This speeds up the compression of full-color
progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
when using modern Intel and AMD CPUs.
1.5.3
=====

201
jcphuff.c
View File

@@ -73,6 +73,10 @@
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
/* Pointer to routine to prepare data for encode_mcu_AC_first() */
void (*AC_first_prepare) (const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits);
/* Pointer to routine to prepare data for encode_mcu_AC_refine() */
int (*AC_refine_prepare) (const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
@@ -144,6 +148,9 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
/* Forward declarations */
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(void) encode_mcu_AC_first_prepare
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
@@ -208,6 +215,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->pub.encode_mcu = encode_mcu_DC_first;
else
entropy->pub.encode_mcu = encode_mcu_AC_first;
if (jsimd_can_encode_mcu_AC_first_prepare())
entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
else
entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
} else {
if (is_DC_band)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
@@ -541,21 +552,116 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
/*
* Data preparation for encode_mcu_AC_first().
*/
#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
for (k = 0; k < Sl; k++) { \
temp = block[jpeg_natural_order_start[k]]; \
if (temp == 0) \
continue; \
/* We must apply the point transform by Al. For AC coefficients this \
* is an integer division with rounding towards 0. To do this portably \
* in C, we shift after obtaining the absolute value; so the code is \
* interwoven with finding the abs value (temp) and output bits (temp2). \
*/ \
temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
temp ^= temp2; \
temp -= temp2; /* temp is abs value of input */ \
temp >>= Al; /* apply the point transform */ \
/* Watch out for case that nonzero coef is zero after point transform */ \
if (temp == 0) \
continue; \
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
temp2 ^= temp; \
values[k] = temp; \
values[k + DCTSIZE2] = temp2; \
zerobits |= ((size_t)1U) << k; \
} \
}
METHODDEF(void)
encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *bits)
{
register int k, temp, temp2;
size_t zerobits = 0U;
int Sl0 = Sl;
#if SIZEOF_SIZE_T == 4
if (Sl0 > 32)
Sl0 = 32;
#endif
COMPUTE_ABSVALUES_AC_FIRST(Sl0);
bits[0] = zerobits;
#if SIZEOF_SIZE_T == 4
zerobits = 0U;
if (Sl > 32) {
Sl -= 32;
jpeg_natural_order_start += 32;
values += 32;
COMPUTE_ABSVALUES_AC_FIRST(Sl);
}
bits[1] = zerobits;
#endif
}
/*
* MCU encoding for AC initial scan (either spectral selection,
* or first pass of successive approximation).
*/
#define ENCODE_COEFS_AC_FIRST(label) { \
while (zerobits) { \
r = count_zeroes(&zerobits); \
cvalue += r; \
label \
temp = cvalue[0]; \
temp2 = cvalue[DCTSIZE2]; \
\
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
while (r > 15) { \
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
r -= 16; \
} \
\
/* Find the number of bits needed for the magnitude of the coefficient */ \
nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
/* Check for out-of-range coefficient values */ \
if (nbits > MAX_COEF_BITS) \
ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
\
/* Count/emit Huffman symbol for run length / number of bits */ \
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
\
/* Emit that number of bits of the value, if positive, */ \
/* or the complement of its magnitude, if negative. */ \
emit_bits(entropy, (unsigned int)temp2, nbits); \
\
cvalue++; \
zerobits >>= 1; \
} \
}
METHODDEF(boolean)
encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp, temp2, temp3;
register int nbits;
register int r, k;
int Se = cinfo->Se;
register int temp, temp2;
register int nbits, r;
int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
JBLOCKROW block;
JCOEF values_unaligned[2 * DCTSIZE2 + 15];
JCOEF *values;
const JCOEF *cvalue;
size_t zerobits;
size_t bits[8 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -565,61 +671,48 @@ encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
/* Encode the MCU data block */
block = MCU_data[0];
#ifdef WITH_SIMD
cvalue = values = (JCOEF *)PAD((size_t)values_unaligned, 16);
#else
/* Not using SIMD, so alignment is not needed */
cvalue = values = values_unaligned;
#endif
/* Prepare data */
entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
Sl, Al, values, bits);
zerobits = bits[0];
#if SIZEOF_SIZE_T == 4
zerobits |= bits[1];
#endif
/* Emit any pending EOBRUN */
if (zerobits && (entropy->EOBRUN > 0))
emit_eobrun(entropy);
#if SIZEOF_SIZE_T == 4
zerobits = bits[0];
#endif
/* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
r = 0; /* r = run length of zeros */
ENCODE_COEFS_AC_FIRST();
for (k = cinfo->Ss; k <= Se; k++) {
if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
r++;
continue;
}
/* We must apply the point transform by Al. For AC coefficients this
* is an integer division with rounding towards 0. To do this portably
* in C, we shift after obtaining the absolute value; so the code is
* interwoven with finding the abs value (temp) and output bits (temp2).
*/
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
temp ^= temp3;
temp -= temp3; /* temp is abs value of input */
temp >>= Al; /* apply the point transform */
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */
temp2 = temp ^ temp3;
/* Watch out for case that nonzero coef is zero after point transform */
if (temp == 0) {
r++;
continue;
}
/* Emit any pending EOBRUN */
if (entropy->EOBRUN > 0)
emit_eobrun(entropy);
/* if run length > 15, must emit special run-length-16 codes (0xF0) */
while (r > 15) {
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
r -= 16;
}
/* Find the number of bits needed for the magnitude of the coefficient */
nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */
/* Check for out-of-range coefficient values */
if (nbits > MAX_COEF_BITS)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count/emit Huffman symbol for run length / number of bits */
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
emit_bits(entropy, (unsigned int)temp2, nbits);
r = 0; /* reset zero run length */
#if SIZEOF_SIZE_T == 4
zerobits = bits[1];
if (zerobits) {
int diff = ((values + DCTSIZE2 / 2) - cvalue);
r = count_zeroes(&zerobits);
r += diff;
cvalue += r;
goto first_iter_ac_first;
}
if (r > 0) { /* If there are trailing zeroes, */
ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
#endif
if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
if (entropy->EOBRUN == 0x7FFF)
emit_eobrun(entropy); /* force it out to avoid overflow */

View File

@@ -104,6 +104,12 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
c_derived_tbl *dctbl,
c_derived_tbl *actbl);
EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
EXTERN(void) jsimd_encode_mcu_AC_first_prepare
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare

View File

@@ -390,6 +390,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -692,6 +692,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -770,6 +770,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
last_dc_val, dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -25,7 +25,8 @@
BITS 32
; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
@@ -245,6 +246,179 @@
mov INT [edi+SIZEOF_INT], edx
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *values,
; size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits
%define ZERO xmm7
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LENEND eax
%define LUT ebx
%define T0 ecx
%define T1 edx
%define BLOCK esi
%define VALUES edi
%define LEN ebp
%define ZEROBITS INT [esp + 5 * 4]
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
sub esp, 4
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
mov BLOCK, INT [eax + 8]
mov LUT, INT [eax + 12]
mov VALUES, INT [eax + 24]
movd AL, INT [eax + 20]
mov T0, INT [eax + 28]
mov ZEROBITS, T0
mov LEN, INT [eax + 16]
pxor ZERO, ZERO
mov K, LEN
and K, -16
shr K, 4
jz .ELOOP16
.BLOOP16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
dec K
jnz .BLOOP16
.ELOOP16:
mov LENEND, LEN
and LENEND, 7
test LEN, 8
jz .TRY7
test LEN, 7
jz .TRY8
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
jmp .PADDING
.TRY8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
jmp .PADDING
.TRY7:
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
.PADDING:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDING
align 16
.ZEROLOOP:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOP
.EPADDING:
sub VALUES, DCTSIZE2*2
REDUCE0
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;

View File

@@ -1199,6 +1199,37 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (SIZEOF_SIZE_T != 4)
return 0;
if (!(simd_support & JSIMD_SSE2))
return 0;
#if defined(HAVE_BUILTIN_CTZL)
return 1;
#elif defined(HAVE_BITSCANFORWARD)
return 1;
#else
return 0;
#endif
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -1074,6 +1074,10 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
c_derived_tbl *dctbl, c_derived_tbl *actbl);
/* Progressive Huffman encoding */
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);

View File

@@ -582,6 +582,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -1087,6 +1087,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -844,6 +844,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{

View File

@@ -26,7 +26,8 @@
BITS 64
; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
@@ -245,6 +246,168 @@
mov MMWORD [r15], rax
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *values,
; size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
%define ZERO xmm9
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LUT r11
%define T0 rcx
%define T0d ecx
%define T1 rdx
%define T1d edx
%define BLOCK r10
%define VALUES r14
%define LEN r12d
%define LENEND r13d
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [rbp - 16]
collect_args 6
movdqa XMMWORD [rbp - 16], ZERO
movd AL, r13d
pxor ZERO, ZERO
mov K, LEN
mov LENEND, LEN
and K, -16
and LENEND, 7
shr K, 4
jz .ELOOP16
.BLOOP16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
dec K
jnz .BLOOP16
.ELOOP16:
test LEN, 8
jz .TRY7
test LEN, 7
jz .TRY8
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
jmp .PADDING
.TRY8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
jmp .PADDING
.TRY7:
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
.PADDING:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDING
align 16
.ZEROLOOP:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOP
.EPADDING:
sub VALUES, DCTSIZE2*2
REDUCE0
movdqa ZERO, XMMWORD [rbp - 16]
uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;

View File

@@ -1022,6 +1022,37 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (SIZEOF_SIZE_T != 8)
return 0;
if (!(simd_support & JSIMD_SSE2))
return 0;
#if defined(HAVE_BUILTIN_CTZL)
return 1;
#elif defined(HAVE_BITSCANFORWARD64)
return 1;
#else
return 0;
#endif
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{