C/SSE2 optimization of encode_mcu_AC_first()
This commit adds C and SSE2 optimizations for the encode_mcu_AC_first() function used in progressive Huffman encoding. The image used for testing can be retrieved from this page: <https://blog.cloudflare.com/doubling-the-speed-of-jpegtran>. All timings were done on an `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz`. The clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)`, the gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0`, and the gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0`. Here are the results in comparison to libjpeg-turbo@293263c using `time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg`. C: clang x86_64: +19%; gcc-5 x86_64: +80%; gcc-7 x86_64: +57%; clang i386: +5%; gcc-5 i386: +59%; gcc-7 i386: +51%. SSE2: clang x86_64: +79%; gcc-5 x86_64: +158%; gcc-7 x86_64: +122%; clang i386: +71%; gcc-5 i386: +134%; gcc-7 i386: +135%. Discussion in libjpeg-turbo/libjpeg-turbo#46.
This commit is contained in:
@@ -122,6 +122,11 @@ approximately 2-3.5x.
|
||||
14. Fixed a build error when building with older MinGW releases (regression
|
||||
caused by 1.5.1[7].)
|
||||
|
||||
15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
|
||||
x86 and x86-64 platforms. This speeds up the compression of full-color
|
||||
progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
|
||||
when using modern Intel and AMD CPUs.
|
||||
|
||||
|
||||
1.5.3
|
||||
=====
|
||||
|
||||
201
jcphuff.c
201
jcphuff.c
@@ -73,6 +73,10 @@
|
||||
typedef struct {
|
||||
struct jpeg_entropy_encoder pub; /* public fields */
|
||||
|
||||
/* Pointer to routine to prepare data for encode_mcu_AC_first() */
|
||||
void (*AC_first_prepare) (const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits);
|
||||
/* Pointer to routine to prepare data for encode_mcu_AC_refine() */
|
||||
int (*AC_refine_prepare) (const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
@@ -144,6 +148,9 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
|
||||
/* Forward declarations */
|
||||
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
|
||||
JBLOCKROW *MCU_data);
|
||||
METHODDEF(void) encode_mcu_AC_first_prepare
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *values, size_t *zerobits);
|
||||
METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
|
||||
JBLOCKROW *MCU_data);
|
||||
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
|
||||
@@ -208,6 +215,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
|
||||
entropy->pub.encode_mcu = encode_mcu_DC_first;
|
||||
else
|
||||
entropy->pub.encode_mcu = encode_mcu_AC_first;
|
||||
if (jsimd_can_encode_mcu_AC_first_prepare())
|
||||
entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
|
||||
else
|
||||
entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
|
||||
} else {
|
||||
if (is_DC_band)
|
||||
entropy->pub.encode_mcu = encode_mcu_DC_refine;
|
||||
@@ -541,21 +552,116 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Data preparation for encode_mcu_AC_first().
|
||||
*/
|
||||
|
||||
/*
 * COMPUTE_ABSVALUES_AC_FIRST(Sl): point-transform the first Sl coefficients
 * selected through jpeg_natural_order_start[].
 *
 * The macro expands in the caller's scope and relies on these caller names:
 * block, jpeg_natural_order_start, values, zerobits, Al, and the int
 * scratch variables k, temp and temp2.
 *
 * For every coefficient that is still nonzero after the shift by Al:
 *   values[k]            = abs(coef) >> Al
 *   values[k + DCTSIZE2] = the same magnitude for a positive coefficient, or
 *                          its bitwise complement for a negative one
 *   bit k of zerobits    = 1
 * Entries belonging to coefficients that are (or become) zero are left
 * untouched, and their zerobits bit is not set.
 *
 * The body is wrapped in do { } while (0) and the argument is parenthesised
 * so that the macro behaves like a single statement and accepts any integer
 * expression; the invocation must be followed by a semicolon.
 */
#define COMPUTE_ABSVALUES_AC_FIRST(Sl)  do { \
  for (k = 0; k < (Sl); k++) { \
    temp = block[jpeg_natural_order_start[k]]; \
    if (temp == 0) \
      continue; \
    /* We must apply the point transform by Al.  For AC coefficients this \
     * is an integer division with rounding towards 0.  To do this portably \
     * in C, we shift after obtaining the absolute value; so the code is \
     * interwoven with finding the abs value (temp) and output bits (temp2). \
     */ \
    temp2 = temp >> (CHAR_BIT * sizeof(int) - 1);  /* sign mask: 0 or -1 */ \
    temp ^= temp2; \
    temp -= temp2;              /* temp is abs value of input */ \
    temp >>= Al;                /* apply the point transform */ \
    /* Watch out for case that nonzero coef is zero after point transform */ \
    if (temp == 0) \
      continue; \
    /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
    temp2 ^= temp; \
    values[k] = temp; \
    values[k + DCTSIZE2] = temp2; \
    zerobits |= ((size_t)1U) << k; \
  } \
} while (0)
|
||||
|
||||
METHODDEF(void)
|
||||
encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *bits)
|
||||
{
|
||||
register int k, temp, temp2;
|
||||
size_t zerobits = 0U;
|
||||
int Sl0 = Sl;
|
||||
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
if (Sl0 > 32)
|
||||
Sl0 = 32;
|
||||
#endif
|
||||
|
||||
COMPUTE_ABSVALUES_AC_FIRST(Sl0);
|
||||
|
||||
bits[0] = zerobits;
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
zerobits = 0U;
|
||||
|
||||
if (Sl > 32) {
|
||||
Sl -= 32;
|
||||
jpeg_natural_order_start += 32;
|
||||
values += 32;
|
||||
|
||||
COMPUTE_ABSVALUES_AC_FIRST(Sl);
|
||||
}
|
||||
bits[1] = zerobits;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* MCU encoding for AC initial scan (either spectral selection,
|
||||
* or first pass of successive approximation).
|
||||
*/
|
||||
|
||||
/*
 * ENCODE_COEFS_AC_FIRST(label): emit the run-length/size symbol and the value
 * bits for every position still flagged in zerobits.
 *
 * Expands in the caller's scope and uses the caller's zerobits, cvalue, r,
 * temp, temp2, nbits, entropy and cinfo.  Each pass through the outer loop:
 *   - calls count_zeroes(&zerobits) to obtain r and advances cvalue by r
 *     (count_zeroes() is defined elsewhere; NOTE(review): presumably it
 *     returns the number of low-order zero bits and shifts them out of
 *     zerobits -- confirm);
 *   - reads the magnitude from cvalue[0] and the bits to emit from
 *     cvalue[DCTSIZE2];
 *   - emits one 0xF0 symbol for every 16 in r, then the (r << 4) + nbits
 *     symbol followed by nbits bits of temp2;
 *   - raises JERR_BAD_DCT_COEF through ERREXIT when nbits exceeds
 *     MAX_COEF_BITS;
 *   - steps cvalue by one and shifts zerobits right by one.
 *
 * The label argument is pasted verbatim right after the cvalue advance.
 * Pass nothing, or a label name followed by a colon so that a goto can enter
 * the loop at that point with r and cvalue already set up.
 */
#define ENCODE_COEFS_AC_FIRST(label) { \
|
||||
while (zerobits) { \
|
||||
r = count_zeroes(&zerobits); \
|
||||
cvalue += r; \
|
||||
label \
|
||||
temp = cvalue[0]; \
|
||||
temp2 = cvalue[DCTSIZE2]; \
|
||||
\
|
||||
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
|
||||
while (r > 15) { \
|
||||
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
|
||||
r -= 16; \
|
||||
} \
|
||||
\
|
||||
/* Find the number of bits needed for the magnitude of the coefficient */ \
|
||||
nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
|
||||
/* Check for out-of-range coefficient values */ \
|
||||
if (nbits > MAX_COEF_BITS) \
|
||||
ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
|
||||
\
|
||||
/* Count/emit Huffman symbol for run length / number of bits */ \
|
||||
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
|
||||
\
|
||||
/* Emit that number of bits of the value, if positive, */ \
|
||||
/* or the complement of its magnitude, if negative. */ \
|
||||
emit_bits(entropy, (unsigned int)temp2, nbits); \
|
||||
\
|
||||
cvalue++; \
|
||||
zerobits >>= 1; \
|
||||
} \
|
||||
}
|
||||
|
||||
METHODDEF(boolean)
|
||||
encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
{
|
||||
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
|
||||
register int temp, temp2, temp3;
|
||||
register int nbits;
|
||||
register int r, k;
|
||||
int Se = cinfo->Se;
|
||||
register int temp, temp2;
|
||||
register int nbits, r;
|
||||
int Sl = cinfo->Se - cinfo->Ss + 1;
|
||||
int Al = cinfo->Al;
|
||||
JBLOCKROW block;
|
||||
JCOEF values_unaligned[2 * DCTSIZE2 + 15];
|
||||
JCOEF *values;
|
||||
const JCOEF *cvalue;
|
||||
size_t zerobits;
|
||||
size_t bits[8 / SIZEOF_SIZE_T];
|
||||
|
||||
entropy->next_output_byte = cinfo->dest->next_output_byte;
|
||||
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
|
||||
@@ -565,61 +671,48 @@ encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
if (entropy->restarts_to_go == 0)
|
||||
emit_restart(entropy, entropy->next_restart_num);
|
||||
|
||||
/* Encode the MCU data block */
|
||||
block = MCU_data[0];
|
||||
#ifdef WITH_SIMD
|
||||
cvalue = values = (JCOEF *)PAD((size_t)values_unaligned, 16);
|
||||
#else
|
||||
/* Not using SIMD, so alignment is not needed */
|
||||
cvalue = values = values_unaligned;
|
||||
#endif
|
||||
|
||||
/* Prepare data */
|
||||
entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
|
||||
Sl, Al, values, bits);
|
||||
|
||||
zerobits = bits[0];
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
zerobits |= bits[1];
|
||||
#endif
|
||||
|
||||
/* Emit any pending EOBRUN */
|
||||
if (zerobits && (entropy->EOBRUN > 0))
|
||||
emit_eobrun(entropy);
|
||||
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
zerobits = bits[0];
|
||||
#endif
|
||||
|
||||
/* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
|
||||
|
||||
r = 0; /* r = run length of zeros */
|
||||
ENCODE_COEFS_AC_FIRST();
|
||||
|
||||
for (k = cinfo->Ss; k <= Se; k++) {
|
||||
if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
/* We must apply the point transform by Al. For AC coefficients this
|
||||
* is an integer division with rounding towards 0. To do this portably
|
||||
* in C, we shift after obtaining the absolute value; so the code is
|
||||
* interwoven with finding the abs value (temp) and output bits (temp2).
|
||||
*/
|
||||
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
temp ^= temp3;
|
||||
temp -= temp3; /* temp is abs value of input */
|
||||
temp >>= Al; /* apply the point transform */
|
||||
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */
|
||||
temp2 = temp ^ temp3;
|
||||
/* Watch out for case that nonzero coef is zero after point transform */
|
||||
if (temp == 0) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Emit any pending EOBRUN */
|
||||
if (entropy->EOBRUN > 0)
|
||||
emit_eobrun(entropy);
|
||||
/* if run length > 15, must emit special run-length-16 codes (0xF0) */
|
||||
while (r > 15) {
|
||||
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
|
||||
r -= 16;
|
||||
}
|
||||
|
||||
/* Find the number of bits needed for the magnitude of the coefficient */
|
||||
nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */
|
||||
/* Check for out-of-range coefficient values */
|
||||
if (nbits > MAX_COEF_BITS)
|
||||
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
|
||||
|
||||
/* Count/emit Huffman symbol for run length / number of bits */
|
||||
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
|
||||
|
||||
/* Emit that number of bits of the value, if positive, */
|
||||
/* or the complement of its magnitude, if negative. */
|
||||
emit_bits(entropy, (unsigned int)temp2, nbits);
|
||||
|
||||
r = 0; /* reset zero run length */
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
zerobits = bits[1];
|
||||
if (zerobits) {
|
||||
int diff = ((values + DCTSIZE2 / 2) - cvalue);
|
||||
r = count_zeroes(&zerobits);
|
||||
r += diff;
|
||||
cvalue += r;
|
||||
goto first_iter_ac_first;
|
||||
}
|
||||
|
||||
if (r > 0) { /* If there are trailing zeroes, */
|
||||
ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
|
||||
#endif
|
||||
|
||||
if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
|
||||
entropy->EOBRUN++; /* count an EOB */
|
||||
if (entropy->EOBRUN == 0x7FFF)
|
||||
emit_eobrun(entropy); /* force it out to avoid overflow */
|
||||
|
||||
6
jsimd.h
6
jsimd.h
@@ -104,6 +104,12 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
|
||||
c_derived_tbl *dctbl,
|
||||
c_derived_tbl *actbl);
|
||||
|
||||
EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
|
||||
|
||||
EXTERN(void) jsimd_encode_mcu_AC_first_prepare
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *values, size_t *zerobits);
|
||||
|
||||
EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
|
||||
|
||||
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
|
||||
|
||||
13
jsimd_none.c
13
jsimd_none.c
@@ -390,6 +390,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Capability query for a SIMD routine that prepares coefficient data for
 * encode_mcu_AC_first().  This implementation always answers 0 ("not
 * available"), which makes start_pass_phuff() install the plain C
 * encode_mcu_AC_first_prepare() instead.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Placeholder with the signature of the SIMD data-preparation routine.  The
 * body is intentionally empty: it reads none of its arguments and writes
 * nothing to values or zerobits.
 * NOTE(review): presumably never called, because the matching
 * jsimd_can_encode_mcu_AC_first_prepare() returns 0 -- confirm against
 * callers.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -692,6 +692,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
/*
 * Capability query for a SIMD routine that prepares coefficient data for
 * encode_mcu_AC_first().  This implementation always answers 0 ("not
 * available"), which makes start_pass_phuff() install the plain C
 * encode_mcu_AC_first_prepare() instead.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Placeholder with the signature of the SIMD data-preparation routine.  The
 * body is intentionally empty: it reads none of its arguments and writes
 * nothing to values or zerobits.
 * NOTE(review): presumably never called, because the matching
 * jsimd_can_encode_mcu_AC_first_prepare() returns 0 -- confirm against
 * callers.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -770,6 +770,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
last_dc_val, dctbl, actbl);
|
||||
}
|
||||
|
||||
/*
 * Capability query for a SIMD routine that prepares coefficient data for
 * encode_mcu_AC_first().  This implementation always answers 0 ("not
 * available"), which makes start_pass_phuff() install the plain C
 * encode_mcu_AC_first_prepare() instead.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Placeholder with the signature of the SIMD data-preparation routine.  The
 * body is intentionally empty: it reads none of its arguments and writes
 * nothing to values or zerobits.
 * NOTE(review): presumably never called, because the matching
 * jsimd_can_encode_mcu_AC_first_prepare() returns 0 -- confirm against
 * callers.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -25,7 +25,8 @@
|
||||
BITS 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
|
||||
%macro LOAD16 0
|
||||
pxor N0, N0
|
||||
@@ -245,6 +246,179 @@
|
||||
mov INT [edi+SIZEOF_INT], edx
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Prepare data for encode_mcu_AC_first().
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *values,
|
||||
; size_t *zerobits)
|
||||
;
|
||||
; eax + 8 = const JCOEF *block
|
||||
; eax + 12 = const int *jpeg_natural_order_start
|
||||
; eax + 16 = int Sl
|
||||
; eax + 20 = int Al
|
||||
; eax + 24 = JCOEF *values
|
||||
; eax + 28 = size_t *zerobits
|
||||
|
||||
%define ZERO xmm7
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define LENEND eax
|
||||
%define LUT ebx
|
||||
%define T0 ecx
|
||||
%define T1 edx
|
||||
%define BLOCK esi
|
||||
%define VALUES edi
|
||||
%define LEN ebp
|
||||
|
||||
%define ZEROBITS INT [esp + 5 * 4]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

; SSE2 data preparation for encode_mcu_AC_first() (32-bit x86).
;
; For each group of coefficients this routine computes the sign mask
; (pcmpgtw against a zeroed N register), the absolute value (add mask, xor
; mask), shifts the magnitude right by Al, and xors the mask back in to get
; the bits to emit.  Magnitudes go to values[0 .. DCTSIZE2-1], emit-bits to
; values[DCTSIZE2 ..].  The unused tail of the magnitude half is zero-filled
; and VALUES is rewound to its start before REDUCE0 runs.
;
; LOAD16 / LOAD15 / LOAD8 / LOAD7 / REDUCE0 are macros defined elsewhere in
; this file.  NOTE(review): the LOADn macros presumably gather n coefficients
; from BLOCK through LUT into X0/X1 with N0/N1 cleared, and REDUCE0
; presumably derives the zerobits bitmap from the magnitudes -- confirm.
;
; Fix relative to the previous revision: when Sl is a multiple of 16 the
; 16-wide loop has already handled every coefficient, so the tail handling
; must be skipped.  Previously such lengths fell into .TRY7, which stored one
; extra 8-word group and advanced VALUES by 16 bytes, so the zero-fill count
; and the final "sub VALUES, DCTSIZE2*2" were off by one group.

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax -> saved ebp; args at eax+8..
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 4                       ; stack slot addressed as ZEROBITS
    push        ebx
    push        ecx
;   push        edx                          ; need not be preserved
    push        esi
    push        edi
    push        ebp

    ; Load the arguments.  eax doubles as K/LENEND, so every argument must be
    ; fetched before K is first written.
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]          ; LEN lives in ebp from here on
    pxor        ZERO, ZERO
    mov         K, LEN
    and         K, -16
    shr         K, 4                         ; K = number of full 16-groups
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0                       ; X = abs(coef)
    pxor        X1, N1
    psrlw       X0, AL                       ; apply the point transform
    psrlw       X1, AL
    pxor        N0, X0                       ; N = magnitude, complemented
    pxor        N1, X1                       ;     for negative coefs
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
.ELOOP16:
    mov         LENEND, LEN
    and         LENEND, 7

    test        LEN, 15
    jz          .PADDING                     ; no tail: nothing more to load
    test        LEN, 8
    jz          .TRY7                        ; tail of 1..7 coefficients
    test        LEN, 7
    jz          .TRY8                        ; tail of exactly 8 coefficients

    ; tail of 9..15 coefficients
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; K = ceil(LEN / 8) - DCTSIZE2 / 8, i.e. minus the number of 8-word
    ; groups of the magnitude half that have not been written yet.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2           ; back to the start of values

    REDUCE0

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                          ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                     ; esp <- aligned ebp
    pop         esp                          ; esp <- original ebp
    pop         ebp
    ret
|
||||
|
||||
%undef ZERO
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T1
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_refine().
|
||||
;
|
||||
|
||||
@@ -1199,6 +1199,37 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (SIZEOF_SIZE_T != 4)
|
||||
return 0;
|
||||
if (!(simd_support & JSIMD_SSE2))
|
||||
return 0;
|
||||
#if defined(HAVE_BUILTIN_CTZL)
|
||||
return 1;
|
||||
#elif defined(HAVE_BITSCANFORWARD)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Dispatch wrapper: forwards all six arguments unchanged to
 * jsimd_encode_mcu_AC_first_prepare_sse2().  It performs no capability
 * check of its own.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
|
||||
Sl, Al, values, zerobits);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -1074,6 +1074,10 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
|
||||
c_derived_tbl *dctbl, c_derived_tbl *actbl);
|
||||
|
||||
/* Progressive Huffman encoding */
|
||||
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *values, size_t *zerobits);
|
||||
|
||||
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *absvalues, size_t *bits);
|
||||
|
||||
@@ -582,6 +582,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Capability query for a SIMD routine that prepares coefficient data for
 * encode_mcu_AC_first().  This implementation always answers 0 ("not
 * available"), which makes start_pass_phuff() install the plain C
 * encode_mcu_AC_first_prepare() instead.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Placeholder with the signature of the SIMD data-preparation routine.  The
 * body is intentionally empty: it reads none of its arguments and writes
 * nothing to values or zerobits.
 * NOTE(review): presumably never called, because the matching
 * jsimd_can_encode_mcu_AC_first_prepare() returns 0 -- confirm against
 * callers.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -1087,6 +1087,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Capability query for a SIMD routine that prepares coefficient data for
 * encode_mcu_AC_first().  This implementation always answers 0 ("not
 * available"), which makes start_pass_phuff() install the plain C
 * encode_mcu_AC_first_prepare() instead.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Placeholder with the signature of the SIMD data-preparation routine.  The
 * body is intentionally empty: it reads none of its arguments and writes
 * nothing to values or zerobits.
 * NOTE(review): presumably never called, because the matching
 * jsimd_can_encode_mcu_AC_first_prepare() returns 0 -- confirm against
 * callers.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -844,6 +844,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Capability query for a SIMD routine that prepares coefficient data for
 * encode_mcu_AC_first().  This implementation always answers 0 ("not
 * available"), which makes start_pass_phuff() install the plain C
 * encode_mcu_AC_first_prepare() instead.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Placeholder with the signature of the SIMD data-preparation routine.  The
 * body is intentionally empty: it reads none of its arguments and writes
 * nothing to values or zerobits.
 * NOTE(review): presumably never called, because the matching
 * jsimd_can_encode_mcu_AC_first_prepare() returns 0 -- confirm against
 * callers.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
@@ -26,7 +26,8 @@
|
||||
BITS 64
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
|
||||
%macro LOAD16 0
|
||||
pxor N0, N0
|
||||
@@ -245,6 +246,168 @@
|
||||
mov MMWORD [r15], rax
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Prepare data for encode_mcu_AC_first().
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *values,
|
||||
; size_t *zerobits)
|
||||
;
|
||||
; r10 = const JCOEF *block
|
||||
; r11 = const int *jpeg_natural_order_start
|
||||
; r12 = int Sl
|
||||
; r13 = int Al
|
||||
; r14 = JCOEF *values
|
||||
; r15 = size_t *zerobits
|
||||
|
||||
%define ZERO xmm9
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define LUT r11
|
||||
%define T0 rcx
|
||||
%define T0d ecx
|
||||
%define T1 rdx
|
||||
%define T1d edx
|
||||
%define BLOCK r10
|
||||
%define VALUES r14
|
||||
%define LEN r12d
|
||||
%define LENEND r13d
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

; SSE2 data preparation for encode_mcu_AC_first() (x86-64).
;
; For each group of coefficients this routine computes the sign mask
; (pcmpgtw against a zeroed N register), the absolute value (add mask, xor
; mask), shifts the magnitude right by Al, and xors the mask back in to get
; the bits to emit.  Magnitudes go to values[0 .. DCTSIZE2-1], emit-bits to
; values[DCTSIZE2 ..].  The unused tail of the magnitude half is zero-filled
; and VALUES is rewound to its start before REDUCE0 runs.
;
; LOAD16 / LOAD15 / LOAD8 / LOAD7 / REDUCE0, collect_args and uncollect_args
; are macros defined elsewhere.  NOTE(review): the LOADn macros presumably
; gather n coefficients from BLOCK through LUT into X0/X1 with N0/N1 cleared,
; and REDUCE0 presumably derives the zerobits bitmap from the magnitudes --
; confirm.
;
; Fix relative to the previous revision: when Sl is a multiple of 16 the
; 16-wide loop has already handled every coefficient, so the tail handling
; must be skipped.  Previously such lengths fell into .TRY7, which stored one
; extra 8-word group and advanced VALUES by 16 bytes, so the zero-fill count
; and the final "sub VALUES, DCTSIZE2*2" were off by one group.

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax -> saved rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]              ; 16-byte save slot for ZERO
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve the caller's ZERO reg

    movd        AL, r13d                     ; Al; r13d is reused as LENEND
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4                         ; K = number of full 16-groups
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0                       ; X = abs(coef)
    pxor        X1, N1
    psrlw       X0, AL                       ; apply the point transform
    psrlw       X1, AL
    pxor        N0, X0                       ; N = magnitude, complemented
    pxor        N1, X1                       ;     for negative coefs
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
.ELOOP16:
    test        LEN, 15
    jz          .PADDING                     ; no tail: nothing more to load
    test        LEN, 8
    jz          .TRY7                        ; tail of 1..7 coefficients
    test        LEN, 7
    jz          .TRY8                        ; tail of exactly 8 coefficients

    ; tail of 9..15 coefficients
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; K = ceil(LEN / 8) - DCTSIZE2 / 8, i.e. minus the number of 8-word
    ; groups of the magnitude half that have not been written yet.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2           ; back to the start of values

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]     ; restore the caller's ZERO reg
    uncollect_args 6
    mov         rsp, rbp                     ; rsp <- aligned rbp
    pop         rsp                          ; rsp <- original rbp
    pop         rbp
    ret
|
||||
|
||||
%undef ZERO
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T0d
|
||||
%undef T1
|
||||
%undef T1d
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
%undef LENEND
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_refine().
|
||||
;
|
||||
|
||||
@@ -1022,6 +1022,37 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (SIZEOF_SIZE_T != 8)
|
||||
return 0;
|
||||
if (!(simd_support & JSIMD_SSE2))
|
||||
return 0;
|
||||
#if defined(HAVE_BUILTIN_CTZL)
|
||||
return 1;
|
||||
#elif defined(HAVE_BITSCANFORWARD64)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Dispatch wrapper: forwards all six arguments unchanged to
 * jsimd_encode_mcu_AC_first_prepare_sse2().  It performs no capability
 * check of its own.
 */
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *values, size_t *zerobits)
|
||||
{
|
||||
jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
|
||||
Sl, Al, values, zerobits);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user