Optimize Huffman encoding
This commit improves the C and SSE2 Huffman encoding implementations in
the following ways:
- Avoid using xmm8-xmm15 in the x86-64 SSE2 implementation. There is no
actual need to use those registers, and avoiding them produces a
cleaner WIN64 function entry/exit as well as shorter code, since REX
prefixes can be avoided. (This is helpful on certain CPUs, such as
Intel Atom, for which instruction fetch and decoding can be a
bottleneck.)
- Optimize register usage so that fewer REX prefixes and
register-register moves are needed.
- Use the bit counter to store the number of free bits in the bit buffer
rather than the number of bits in the bit buffer. This changes the
method for inserting a code into the bit buffer to:
(put_buffer |= code << (free_bits -= code_size));
As a result:
* Only one bit counter needs to stay in a register (we just keep it in
cl.)
* The bit buffer contents are already properly aligned to be written
out (after a byte swap.)
* Adjusting the free bits counter and checking if the bit buffer is
full can be combined into a single operation.
* We can wait to flush the bit buffer until the buffer is actually
full and not just in danger of becoming full. Thus, eight bytes can
be flushed at a time.
- Speed is quite sensitive to the alignment of branch target labels, so
insert some padding and remove branches from the flush code.
(Flushing this way isn't actually faster when compared to using
branches, but the branchless code doesn't need extra alignment and is
thus smaller.)
- Speculatively write out the bit buffer as a single 8-byte write,
falling back to a byte-by-byte write only if there are any 0xFF bytes
in the bit buffer that need to be encoded as 0xFF 0x00.
- Use MMX registers for the 32-bit implementation (so the bit buffer can
be 64 bits wide.)
- Slightly reduce overall function code size.
- Eliminate or combine a few SSE instructions.
- Make some minor improvements to instruction scheduling.
- Adjust flush_bits() in jchuff.c to handle cases in which the bit
buffer has fewer than 7 free bits (apparently that couldn't happen
before.)
Based on:
947a09defa
262ebb6b81
6e9a091221
See change log for performance claims.
Closes #292
This commit is contained in:
@@ -45,6 +45,13 @@ longer supports 32-bit Java virtual machines. Oracle no longer provides a
|
||||
32-bit JVM for macOS, and Apple's implementation of Java 1.6 (Java for OS X
|
||||
systems) is long obsolete.
|
||||
|
||||
5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
|
||||
significantly optimized, resulting in a measured average overall compression
|
||||
speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
|
||||
and AMD CPUs, as well as a measured average overall compression speedup of
|
||||
0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
|
||||
implementation.
|
||||
|
||||
|
||||
2.0.4
|
||||
=====
|
||||
|
||||
324
jchuff.c
324
jchuff.c
@@ -72,9 +72,33 @@ typedef unsigned long long bit_buf_type;
|
||||
typedef size_t bit_buf_type;
|
||||
#endif
|
||||
|
||||
/* NOTE: The more optimal Huffman encoding algorithm has not yet been
|
||||
* implemented in the ARM NEON SIMD extensions, which is why we retain the old
|
||||
* Huffman encoder behavior for that platform.
|
||||
*/
|
||||
#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
|
||||
typedef unsigned long long simd_bit_buf_type;
|
||||
#else
|
||||
typedef bit_buf_type simd_bit_buf_type;
|
||||
#endif
|
||||
|
||||
#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
|
||||
(defined(__x86_64__) && defined(__ILP32__))
|
||||
#define BIT_BUF_SIZE 64
|
||||
#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
|
||||
#define BIT_BUF_SIZE 32
|
||||
#else
|
||||
#error Cannot determine word size
|
||||
#endif
|
||||
#define SIMD_BIT_BUF_SIZE (sizeof(simd_bit_buf_type) * 8)
|
||||
|
||||
typedef struct {
|
||||
bit_buf_type put_buffer; /* current bit-accumulation buffer */
|
||||
int put_bits; /* # of bits now in it */
|
||||
union {
|
||||
bit_buf_type c;
|
||||
simd_bit_buf_type simd;
|
||||
} put_buffer; /* current bit accumulation buffer */
|
||||
int free_bits; /* # of bits available in it */
|
||||
/* (ARM SIMD: # of bits now in it) */
|
||||
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
|
||||
} savable_state;
|
||||
|
||||
@@ -110,6 +134,7 @@ typedef struct {
|
||||
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
|
||||
savable_state cur; /* Current bit buffer & DC state */
|
||||
j_compress_ptr cinfo; /* dump_buffer needs access to this */
|
||||
int simd;
|
||||
} working_state;
|
||||
|
||||
|
||||
@@ -188,8 +213,17 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
|
||||
}
|
||||
|
||||
/* Initialize bit buffer to empty */
|
||||
entropy->saved.put_buffer = 0;
|
||||
entropy->saved.put_bits = 0;
|
||||
if (entropy->simd) {
|
||||
entropy->saved.put_buffer.simd = 0;
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
entropy->saved.free_bits = 0;
|
||||
#else
|
||||
entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
|
||||
#endif
|
||||
} else {
|
||||
entropy->saved.put_buffer.c = 0;
|
||||
entropy->saved.free_bits = BIT_BUF_SIZE;
|
||||
}
|
||||
|
||||
/* Initialize restart stuff */
|
||||
entropy->restarts_to_go = cinfo->restart_interval;
|
||||
@@ -321,94 +355,94 @@ dump_buffer(working_state *state)
|
||||
|
||||
/* Outputting bits to the file */
|
||||
|
||||
/* These macros perform the same task as the emit_bits() function in the
|
||||
* original libjpeg code. In addition to reducing overhead by explicitly
|
||||
* inlining the code, additional performance is achieved by taking into
|
||||
* account the size of the bit buffer and waiting until it is almost full
|
||||
* before emptying it. This mostly benefits 64-bit platforms, since 6
|
||||
* bytes can be stored in a 64-bit bit buffer before it has to be emptied.
|
||||
/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be
|
||||
* encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
|
||||
* byte is 0xFF. Otherwise, the output buffer pointer is advanced by 1, and
|
||||
* the speculative 0 byte will be overwritten by the next byte.
|
||||
*/
|
||||
|
||||
#define EMIT_BYTE() { \
|
||||
JOCTET c; \
|
||||
put_bits -= 8; \
|
||||
c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
|
||||
*buffer++ = c; \
|
||||
if (c == 0xFF) /* need to stuff a zero byte? */ \
|
||||
*buffer++ = 0; \
|
||||
#define EMIT_BYTE(b) { \
|
||||
buffer[0] = (JOCTET)(b); \
|
||||
buffer[1] = 0; \
|
||||
buffer -= -2 + ((JOCTET)(b) < 0xFF); \
|
||||
}
|
||||
|
||||
#define PUT_BITS(code, size) { \
|
||||
put_bits += size; \
|
||||
put_buffer = (put_buffer << size) | code; \
|
||||
}
|
||||
/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
|
||||
* directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
|
||||
* encode 0xFF as 0xFF 0x00.
|
||||
*/
|
||||
#if BIT_BUF_SIZE == 64
|
||||
|
||||
#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
|
||||
|
||||
#define CHECKBUF15() { \
|
||||
if (put_bits > 15) { \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
#define FLUSH() { \
|
||||
if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
|
||||
EMIT_BYTE(put_buffer >> 56) \
|
||||
EMIT_BYTE(put_buffer >> 48) \
|
||||
EMIT_BYTE(put_buffer >> 40) \
|
||||
EMIT_BYTE(put_buffer >> 32) \
|
||||
EMIT_BYTE(put_buffer >> 24) \
|
||||
EMIT_BYTE(put_buffer >> 16) \
|
||||
EMIT_BYTE(put_buffer >> 8) \
|
||||
EMIT_BYTE(put_buffer ) \
|
||||
} else { \
|
||||
buffer[0] = (JOCTET)(put_buffer >> 56); \
|
||||
buffer[1] = (JOCTET)(put_buffer >> 48); \
|
||||
buffer[2] = (JOCTET)(put_buffer >> 40); \
|
||||
buffer[3] = (JOCTET)(put_buffer >> 32); \
|
||||
buffer[4] = (JOCTET)(put_buffer >> 24); \
|
||||
buffer[5] = (JOCTET)(put_buffer >> 16); \
|
||||
buffer[6] = (JOCTET)(put_buffer >> 8); \
|
||||
buffer[7] = (JOCTET)(put_buffer); \
|
||||
buffer += 8; \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define CHECKBUF31() { \
|
||||
if (put_bits > 31) { \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CHECKBUF47() { \
|
||||
if (put_bits > 47) { \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
EMIT_BYTE() \
|
||||
} \
|
||||
}
|
||||
|
||||
#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
|
||||
#error Cannot determine word size
|
||||
#endif
|
||||
|
||||
#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
|
||||
|
||||
#define EMIT_BITS(code, size) { \
|
||||
CHECKBUF47() \
|
||||
PUT_BITS(code, size) \
|
||||
}
|
||||
|
||||
#define EMIT_CODE(code, size) { \
|
||||
temp2 &= (((JLONG)1) << nbits) - 1; \
|
||||
CHECKBUF31() \
|
||||
PUT_BITS(code, size) \
|
||||
PUT_BITS(temp2, nbits) \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define EMIT_BITS(code, size) { \
|
||||
PUT_BITS(code, size) \
|
||||
CHECKBUF15() \
|
||||
}
|
||||
|
||||
#define EMIT_CODE(code, size) { \
|
||||
temp2 &= (((JLONG)1) << nbits) - 1; \
|
||||
PUT_BITS(code, size) \
|
||||
CHECKBUF15() \
|
||||
PUT_BITS(temp2, nbits) \
|
||||
CHECKBUF15() \
|
||||
#define FLUSH() { \
|
||||
if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
|
||||
EMIT_BYTE(put_buffer >> 24) \
|
||||
EMIT_BYTE(put_buffer >> 16) \
|
||||
EMIT_BYTE(put_buffer >> 8) \
|
||||
EMIT_BYTE(put_buffer ) \
|
||||
} else { \
|
||||
buffer[0] = (JOCTET)(put_buffer >> 24); \
|
||||
buffer[1] = (JOCTET)(put_buffer >> 16); \
|
||||
buffer[2] = (JOCTET)(put_buffer >> 8); \
|
||||
buffer[3] = (JOCTET)(put_buffer); \
|
||||
buffer += 4; \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Fill the bit buffer to capacity with the leading bits from code, then output
|
||||
* the bit buffer and put the remaining bits from code into the bit buffer.
|
||||
*/
|
||||
#define PUT_AND_FLUSH(code, size) { \
|
||||
put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
|
||||
FLUSH() \
|
||||
free_bits += BIT_BUF_SIZE; \
|
||||
put_buffer = code; \
|
||||
}
|
||||
|
||||
/* Insert code into the bit buffer and output the bit buffer if needed.
|
||||
* NOTE: We can't flush with free_bits == 0, since the left shift in
|
||||
* PUT_AND_FLUSH() would have undefined behavior.
|
||||
*/
|
||||
#define PUT_BITS(code, size) { \
|
||||
free_bits -= size; \
|
||||
if (free_bits < 0) \
|
||||
PUT_AND_FLUSH(code, size) \
|
||||
else \
|
||||
put_buffer = (put_buffer << size) | code; \
|
||||
}
|
||||
|
||||
#define PUT_CODE(code, size) { \
|
||||
temp &= (((JLONG)1) << nbits) - 1; \
|
||||
temp |= code << nbits; \
|
||||
nbits += size; \
|
||||
PUT_BITS(temp, nbits) \
|
||||
}
|
||||
|
||||
|
||||
/* Although it is exceedingly rare, it is possible for a Huffman-encoded
|
||||
* coefficient block to be larger than the 128-byte unencoded block. For each
|
||||
@@ -431,6 +465,7 @@ dump_buffer(working_state *state)
|
||||
|
||||
#define STORE_BUFFER() { \
|
||||
if (localbuf) { \
|
||||
size_t bytes, bytestocopy; \
|
||||
bytes = buffer - _buffer; \
|
||||
buffer = _buffer; \
|
||||
while (bytes > 0) { \
|
||||
@@ -453,20 +488,46 @@ dump_buffer(working_state *state)
|
||||
LOCAL(boolean)
|
||||
flush_bits(working_state *state)
|
||||
{
|
||||
JOCTET _buffer[BUFSIZE], *buffer;
|
||||
bit_buf_type put_buffer; int put_bits;
|
||||
size_t bytes, bytestocopy; int localbuf = 0;
|
||||
JOCTET _buffer[BUFSIZE], *buffer, temp;
|
||||
simd_bit_buf_type put_buffer; int put_bits;
|
||||
int localbuf = 0;
|
||||
|
||||
if (state->simd) {
|
||||
#if defined __arm__ || defined __aarch64__
|
||||
put_bits = state->cur.free_bits;
|
||||
#else
|
||||
put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
|
||||
#endif
|
||||
put_buffer = state->cur.put_buffer.simd;
|
||||
} else {
|
||||
put_bits = BIT_BUF_SIZE - state->cur.free_bits;
|
||||
put_buffer = state->cur.put_buffer.c;
|
||||
}
|
||||
|
||||
put_buffer = state->cur.put_buffer;
|
||||
put_bits = state->cur.put_bits;
|
||||
LOAD_BUFFER()
|
||||
|
||||
/* fill any partial byte with ones */
|
||||
PUT_BITS(0x7F, 7)
|
||||
while (put_bits >= 8) EMIT_BYTE()
|
||||
while (put_bits >= 8) {
|
||||
put_bits -= 8;
|
||||
temp = (JOCTET)(put_buffer >> put_bits);
|
||||
EMIT_BYTE(temp)
|
||||
}
|
||||
if (put_bits) {
|
||||
/* fill partial byte with ones */
|
||||
temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
|
||||
EMIT_BYTE(temp)
|
||||
}
|
||||
|
||||
state->cur.put_buffer = 0; /* and reset bit-buffer to empty */
|
||||
state->cur.put_bits = 0;
|
||||
if (state->simd) { /* and reset bit buffer to empty */
|
||||
state->cur.put_buffer.simd = 0;
|
||||
#if defined __arm__ || defined __aarch64__
|
||||
state->cur.free_bits = 0;
|
||||
#else
|
||||
state->cur.free_bits = SIMD_BIT_BUF_SIZE;
|
||||
#endif
|
||||
} else {
|
||||
state->cur.put_buffer.c = 0;
|
||||
state->cur.free_bits = BIT_BUF_SIZE;
|
||||
}
|
||||
STORE_BUFFER()
|
||||
|
||||
return TRUE;
|
||||
@@ -480,7 +541,7 @@ encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
|
||||
c_derived_tbl *dctbl, c_derived_tbl *actbl)
|
||||
{
|
||||
JOCTET _buffer[BUFSIZE], *buffer;
|
||||
size_t bytes, bytestocopy; int localbuf = 0;
|
||||
int localbuf = 0;
|
||||
|
||||
LOAD_BUFFER()
|
||||
|
||||
@@ -496,53 +557,41 @@ LOCAL(boolean)
|
||||
encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
|
||||
c_derived_tbl *dctbl, c_derived_tbl *actbl)
|
||||
{
|
||||
int temp, temp2, temp3;
|
||||
int nbits;
|
||||
int r, code, size;
|
||||
int temp, nbits, free_bits;
|
||||
bit_buf_type put_buffer;
|
||||
JOCTET _buffer[BUFSIZE], *buffer;
|
||||
bit_buf_type put_buffer; int put_bits;
|
||||
int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
|
||||
size_t bytes, bytestocopy; int localbuf = 0;
|
||||
int localbuf = 0;
|
||||
|
||||
put_buffer = state->cur.put_buffer;
|
||||
put_bits = state->cur.put_bits;
|
||||
free_bits = state->cur.free_bits;
|
||||
put_buffer = state->cur.put_buffer.c;
|
||||
LOAD_BUFFER()
|
||||
|
||||
/* Encode the DC coefficient difference per section F.1.2.1 */
|
||||
|
||||
temp = temp2 = block[0] - last_dc_val;
|
||||
temp = block[0] - last_dc_val;
|
||||
|
||||
/* This is a well-known technique for obtaining the absolute value without a
|
||||
* branch. It is derived from an assembly language technique presented in
|
||||
* "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
|
||||
* Agner Fog.
|
||||
* Agner Fog. This code assumes we are on a two's complement machine.
|
||||
*/
|
||||
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
temp ^= temp3;
|
||||
temp -= temp3;
|
||||
|
||||
/* For a negative input, want temp2 = bitwise complement of abs(input) */
|
||||
/* This code assumes we are on a two's complement machine */
|
||||
temp2 += temp3;
|
||||
nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
temp += nbits;
|
||||
nbits ^= temp;
|
||||
|
||||
/* Find the number of bits needed for the magnitude of the coefficient */
|
||||
nbits = JPEG_NBITS(temp);
|
||||
nbits = JPEG_NBITS(nbits);
|
||||
|
||||
/* Emit the Huffman-coded symbol for the number of bits */
|
||||
code = dctbl->ehufco[nbits];
|
||||
size = dctbl->ehufsi[nbits];
|
||||
EMIT_BITS(code, size)
|
||||
|
||||
/* Mask off any extra bits in code */
|
||||
temp2 &= (((JLONG)1) << nbits) - 1;
|
||||
|
||||
/* Emit that number of bits of the value, if positive, */
|
||||
/* or the complement of its magnitude, if negative. */
|
||||
EMIT_BITS(temp2, nbits)
|
||||
/* Emit the Huffman-coded symbol for the number of bits.
|
||||
* Emit that number of bits of the value, if positive,
|
||||
* or the complement of its magnitude, if negative.
|
||||
*/
|
||||
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
|
||||
|
||||
/* Encode the AC coefficients per section F.1.2.2 */
|
||||
|
||||
r = 0; /* r = run length of zeros */
|
||||
{
|
||||
int r = 0; /* r = run length of zeros */
|
||||
|
||||
/* Manually unroll the k loop to eliminate the counter variable. This
|
||||
* improves performance greatly on systems with a limited number of
|
||||
@@ -550,25 +599,21 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
|
||||
*/
|
||||
#define kloop(jpeg_natural_order_of_k) { \
|
||||
if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
|
||||
r++; \
|
||||
r += 16; \
|
||||
} else { \
|
||||
temp2 = temp; \
|
||||
/* Branch-less absolute value, bitwise complement, etc., same as above */ \
|
||||
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
|
||||
temp ^= temp3; \
|
||||
temp -= temp3; \
|
||||
temp2 += temp3; \
|
||||
nbits = JPEG_NBITS_NONZERO(temp); \
|
||||
nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
|
||||
temp += nbits; \
|
||||
nbits ^= temp; \
|
||||
nbits = JPEG_NBITS_NONZERO(nbits); \
|
||||
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
|
||||
while (r > 15) { \
|
||||
EMIT_BITS(code_0xf0, size_0xf0) \
|
||||
r -= 16; \
|
||||
while (r >= 16 * 16) { \
|
||||
r -= 16 * 16; \
|
||||
PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
|
||||
} \
|
||||
/* Emit Huffman symbol for run length / number of bits */ \
|
||||
temp3 = (r << 4) + nbits; \
|
||||
code = actbl->ehufco[temp3]; \
|
||||
size = actbl->ehufsi[temp3]; \
|
||||
EMIT_CODE(code, size) \
|
||||
r += nbits; \
|
||||
PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
|
||||
r = 0; \
|
||||
} \
|
||||
}
|
||||
@@ -588,13 +633,12 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
|
||||
|
||||
/* If the last coef(s) were zero, emit an end-of-block code */
|
||||
if (r > 0) {
|
||||
code = actbl->ehufco[0];
|
||||
size = actbl->ehufsi[0];
|
||||
EMIT_BITS(code, size)
|
||||
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
|
||||
}
|
||||
}
|
||||
|
||||
state->cur.put_buffer = put_buffer;
|
||||
state->cur.put_bits = put_bits;
|
||||
state->cur.put_buffer.c = put_buffer;
|
||||
state->cur.free_bits = free_bits;
|
||||
STORE_BUFFER()
|
||||
|
||||
return TRUE;
|
||||
@@ -643,6 +687,7 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
state.free_in_buffer = cinfo->dest->free_in_buffer;
|
||||
state.cur = entropy->saved;
|
||||
state.cinfo = cinfo;
|
||||
state.simd = entropy->simd;
|
||||
|
||||
/* Emit restart marker if needed */
|
||||
if (cinfo->restart_interval) {
|
||||
@@ -712,6 +757,7 @@ finish_pass_huff(j_compress_ptr cinfo)
|
||||
state.free_in_buffer = cinfo->dest->free_in_buffer;
|
||||
state.cur = entropy->saved;
|
||||
state.cinfo = cinfo;
|
||||
state.simd = entropy->simd;
|
||||
|
||||
/* Flush out the last data */
|
||||
if (!flush_bits(&state))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -135,6 +135,8 @@ section .note.GNU-stack noalloc noexec nowrite progbits
|
||||
%define POINTER qword ; general pointer type
|
||||
%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
|
||||
%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
|
||||
%define resp resq
|
||||
%define dp dq
|
||||
%define raxp rax
|
||||
%define rbxp rbx
|
||||
%define rcxp rcx
|
||||
@@ -157,6 +159,8 @@ section .note.GNU-stack noalloc noexec nowrite progbits
|
||||
%define POINTER dword ; general pointer type
|
||||
%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
|
||||
%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
|
||||
%define resp resd
|
||||
%define dp dd
|
||||
; x86_64 ILP32 ABI (x32)
|
||||
%define raxp eax
|
||||
%define rbxp ebx
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
;
|
||||
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
|
||||
; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
|
||||
; Copyright (C) 2015, Matthieu Darbois.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
;
|
||||
@@ -16,11 +16,25 @@
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation for Huffman coding of one block.
|
||||
; The following code is based directly on jchuff.c; see jchuff.c for more
|
||||
; details.
|
||||
; The following code is based on jchuff.c; see jchuff.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
struc working_state
|
||||
.next_output_byte: resp 1 ; => next byte to write in buffer
|
||||
.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
|
||||
.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
|
||||
.cur.free_bits resd 1 ; # of bits available in it
|
||||
.cur.last_dc_val resd 4 ; last DC coef for each component
|
||||
.cinfo: resp 1 ; dump_buffer needs access to this
|
||||
endstruc
|
||||
|
||||
struc c_derived_tbl
|
||||
.ehufco: resd 256 ; code for each symbol
|
||||
.ehufsi: resb 256 ; length of code for each symbol
|
||||
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
|
||||
endstruc
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
@@ -29,134 +43,137 @@
|
||||
|
||||
EXTN(jconst_huff_encode_one_block):
|
||||
|
||||
%include "jpeg_nbits_table.inc"
|
||||
jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
|
||||
dd 0x000f, 0x001f, 0x003f, 0x007f
|
||||
dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
|
||||
dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
|
||||
|
||||
alignz 32
|
||||
|
||||
times 1 << 14 db 15
|
||||
times 1 << 13 db 14
|
||||
times 1 << 12 db 13
|
||||
times 1 << 11 db 12
|
||||
times 1 << 10 db 11
|
||||
times 1 << 9 db 10
|
||||
times 1 << 8 db 9
|
||||
times 1 << 7 db 8
|
||||
times 1 << 6 db 7
|
||||
times 1 << 5 db 6
|
||||
times 1 << 4 db 5
|
||||
times 1 << 3 db 4
|
||||
times 1 << 2 db 3
|
||||
times 1 << 1 db 2
|
||||
times 1 << 0 db 1
|
||||
times 1 db 0
|
||||
jpeg_nbits_table:
|
||||
times 1 db 0
|
||||
times 1 << 0 db 1
|
||||
times 1 << 1 db 2
|
||||
times 1 << 2 db 3
|
||||
times 1 << 3 db 4
|
||||
times 1 << 4 db 5
|
||||
times 1 << 5 db 6
|
||||
times 1 << 6 db 7
|
||||
times 1 << 7 db 8
|
||||
times 1 << 8 db 9
|
||||
times 1 << 9 db 10
|
||||
times 1 << 10 db 11
|
||||
times 1 << 11 db 12
|
||||
times 1 << 12 db 13
|
||||
times 1 << 13 db 14
|
||||
times 1 << 14 db 15
|
||||
|
||||
alignz 32
|
||||
|
||||
%define NBITS(x) nbits_base + x
|
||||
%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
|
||||
; These macros perform the same task as the emit_bits() function in the
|
||||
; original libjpeg code. In addition to reducing overhead by explicitly
|
||||
; inlining the code, additional performance is achieved by taking into
|
||||
; account the size of the bit buffer and waiting until it is almost full
|
||||
; before emptying it. This mostly benefits 64-bit platforms, since 6
|
||||
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
|
||||
; Shorthand used to describe SIMD operations:
|
||||
; wN: xmmN treated as eight signed 16-bit values
|
||||
; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
|
||||
; bN: xmmN treated as 16 unsigned 8-bit values
|
||||
; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
|
||||
; Contents of SIMD registers are shown in memory order.
|
||||
|
||||
%macro EMIT_BYTE 0
|
||||
sub put_bits, 8 ; put_bits -= 8;
|
||||
mov rdx, put_buffer
|
||||
mov ecx, put_bits
|
||||
shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
|
||||
mov byte [buffer], dl ; *buffer++ = c;
|
||||
add buffer, 1
|
||||
cmp dl, 0xFF ; need to stuff a zero byte?
|
||||
jne %%.EMIT_BYTE_END
|
||||
mov byte [buffer], 0 ; *buffer++ = 0;
|
||||
add buffer, 1
|
||||
%%.EMIT_BYTE_END:
|
||||
%endmacro
|
||||
; Fill the bit buffer to capacity with the leading bits from code, then output
|
||||
; the bit buffer and put the remaining bits from code into the bit buffer.
|
||||
;
|
||||
; Usage:
|
||||
; code - contains the bits to shift into the bit buffer (LSB-aligned)
|
||||
; %1 - the label to which to jump when the macro completes
|
||||
; %2 (optional) - extra instructions to execute after nbits has been set
|
||||
;
|
||||
; Upon completion, free_bits will be set to 64 minus the number of remaining
|
||||
; bits from code (i.e. the number of bits still free in the bit buffer), and
|
||||
; put_buffer will contain those remaining bits. temp and code will be clobbered.
|
||||
;
|
||||
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
|
||||
; macro in jchuff.c.
|
||||
|
||||
%macro PUT_BITS 1
|
||||
add put_bits, ecx ; put_bits += size;
|
||||
shl put_buffer, cl ; put_buffer = (put_buffer << size);
|
||||
or put_buffer, %1
|
||||
%endmacro
|
||||
|
||||
%macro CHECKBUF31 0
|
||||
cmp put_bits, 32 ; if (put_bits > 31) {
|
||||
jl %%.CHECKBUF31_END
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
%%.CHECKBUF31_END:
|
||||
%endmacro
|
||||
|
||||
%macro CHECKBUF47 0
|
||||
cmp put_bits, 48 ; if (put_bits > 47) {
|
||||
jl %%.CHECKBUF47_END
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
%%.CHECKBUF47_END:
|
||||
%endmacro
|
||||
|
||||
%macro EMIT_BITS 2
|
||||
CHECKBUF47
|
||||
mov ecx, %2
|
||||
PUT_BITS %1
|
||||
%endmacro
|
||||
|
||||
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
|
||||
pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
|
||||
pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
|
||||
pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
|
||||
pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
|
||||
pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
|
||||
pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
|
||||
pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
|
||||
pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
|
||||
pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
|
||||
pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
|
||||
pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
|
||||
pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
|
||||
pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
|
||||
pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
|
||||
pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
|
||||
pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
|
||||
pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
|
||||
pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
|
||||
pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
|
||||
pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
|
||||
pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
|
||||
pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
|
||||
pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
|
||||
pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
|
||||
pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
|
||||
pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
|
||||
pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
|
||||
pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
|
||||
pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
|
||||
pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
|
||||
pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
|
||||
pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
|
||||
%if %1 != 32
|
||||
pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
|
||||
%else
|
||||
pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
|
||||
%endif
|
||||
pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
|
||||
pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
|
||||
movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
|
||||
movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
|
||||
movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
|
||||
movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
|
||||
movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
|
||||
movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
|
||||
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
|
||||
movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
|
||||
%macro EMIT_QWORD 1-2
|
||||
add nbitsb, free_bitsb ; nbits += free_bits;
|
||||
neg free_bitsb ; free_bits = -free_bits;
|
||||
mov tempd, code ; temp = code;
|
||||
shl put_buffer, nbitsb ; put_buffer <<= nbits;
|
||||
mov nbitsb, free_bitsb ; nbits = free_bits;
|
||||
neg free_bitsb ; free_bits = -free_bits;
|
||||
shr tempd, nbitsb ; temp >>= nbits;
|
||||
or tempq, put_buffer ; temp |= put_buffer;
|
||||
movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
|
||||
bswap tempq ; temp = htonl(temp);
|
||||
mov put_buffer, codeq ; put_buffer = code;
|
||||
pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
|
||||
%2
|
||||
pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
|
||||
mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
|
||||
; (speculative; will be overwritten if
|
||||
; code contains any 0xFF bytes)
|
||||
add free_bitsb, 64 ; free_bits += 64;
|
||||
add bufferp, 8 ; buffer += 8;
|
||||
test code, code ; if (code == 0) /* No 0xFF bytes */
|
||||
jz %1 ; return;
|
||||
; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
|
||||
; bytes in the qword.
|
||||
cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
|
||||
mov byte [buffer-7], 0 ; buffer[-7] = 0;
|
||||
sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
|
||||
mov byte [buffer], temph ; buffer[0] = temp[1];
|
||||
cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
|
||||
shr tempq, 16 ; temp >>= 16;
|
||||
mov byte [buffer], tempb ; buffer[0] = temp[0];
|
||||
cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
|
||||
mov byte [buffer], temph ; buffer[0] = temp[1];
|
||||
cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
|
||||
shr tempq, 16 ; temp >>= 16;
|
||||
mov byte [buffer], tempb ; buffer[0] = temp[0];
|
||||
cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
|
||||
mov byte [buffer], temph ; buffer[0] = temp[1];
|
||||
cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
|
||||
shr tempd, 16 ; temp >>= 16;
|
||||
mov byte [buffer], tempb ; buffer[0] = temp[0];
|
||||
cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
|
||||
mov byte [buffer], temph ; buffer[0] = temp[1];
|
||||
cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
|
||||
mov byte [buffer+1], 0 ; buffer[1] = 0;
|
||||
sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
|
||||
jmp %1 ; return;
|
||||
%endmacro
|
||||
|
||||
;
|
||||
@@ -167,181 +184,398 @@ EXTN(jconst_huff_encode_one_block):
|
||||
; JCOEFPTR block, int last_dc_val,
|
||||
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
|
||||
;
|
||||
; NOTES:
|
||||
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
|
||||
; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
|
||||
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
|
||||
; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
|
||||
; requires 2 µops (with memory operand) on Intel. In either case, only one
|
||||
; pinsrw instruction can be decoded per cycle (and nothing else if they are
|
||||
; back-to-back), so out-of-order execution cannot be used to work around long
|
||||
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
|
||||
; problem if the code runs from the µop cache.)
|
||||
;
|
||||
; We use tzcnt instead of bsf without checking for support. The instruction is
|
||||
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
|
||||
; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
|
||||
; an input dependency (although the behavior is not formally defined, Intel
|
||||
; CPUs usually leave the destination unmodified if the source is zero.) This
|
||||
; can prevent out-of-order execution, so we clear the destination before
|
||||
; invoking tzcnt.
|
||||
;
|
||||
; Initial register allocation
|
||||
; rax - buffer
|
||||
; rbx - temp
|
||||
; rcx - nbits
|
||||
; rdx - block --> free_bits
|
||||
; rsi - nbits_base
|
||||
; rdi - t
|
||||
; rbp - code
|
||||
; r8 - dctbl --> code_temp
|
||||
; r9 - actbl
|
||||
; r10 - state
|
||||
; r11 - index
|
||||
; r12 - put_buffer
|
||||
|
||||
; r10 = working_state *state
|
||||
; r11 = JOCTET *buffer
|
||||
; r12 = JCOEFPTR block
|
||||
; r13d = int last_dc_val
|
||||
; r14 = c_derived_tbl *dctbl
|
||||
; r15 = c_derived_tbl *actbl
|
||||
|
||||
%define t1 rbp - (DCTSIZE2 * SIZEOF_WORD)
|
||||
%define t2 t1 - (DCTSIZE2 * SIZEOF_WORD)
|
||||
%define put_buffer r8
|
||||
%define put_bits r9d
|
||||
%define buffer rax
|
||||
%ifdef WIN64
|
||||
%define bufferp rax
|
||||
%else
|
||||
%define bufferp raxp
|
||||
%endif
|
||||
%define tempq rbx
|
||||
%define tempd ebx
|
||||
%define tempb bl
|
||||
%define temph bh
|
||||
%define nbitsq rcx
|
||||
%define nbits ecx
|
||||
%define nbitsb cl
|
||||
%define block rdx
|
||||
%define nbits_base rsi
|
||||
%define t rdi
|
||||
%define td edi
|
||||
%define codeq rbp
|
||||
%define code ebp
|
||||
%define dctbl r8
|
||||
%define actbl r9
|
||||
%define state r10
|
||||
%define index r11
|
||||
%define indexd r11d
|
||||
%define put_buffer r12
|
||||
%define put_bufferd r12d
|
||||
|
||||
; Step 1: Re-arrange input data according to jpeg_natural_order
|
||||
; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
|
||||
; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
|
||||
; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
|
||||
; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
|
||||
; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
|
||||
; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
|
||||
; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
|
||||
; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
|
||||
|
||||
EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [t2]
|
||||
push_xmm 4
|
||||
collect_args 6
|
||||
|
||||
%ifdef WIN64
|
||||
|
||||
; rcx = working_state *state
|
||||
; rdx = JOCTET *buffer
|
||||
; r8 = JCOEFPTR block
|
||||
; r9 = int last_dc_val
|
||||
; [rax+48] = c_derived_tbl *dctbl
|
||||
; [rax+56] = c_derived_tbl *actbl
|
||||
|
||||
;X: X = code stream
|
||||
mov buffer, rdx
|
||||
mov block, r8
|
||||
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
|
||||
push rbx
|
||||
push rbp
|
||||
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
|
||||
mov state, rcx
|
||||
movsx code, word [block] ;Z: code = block[0];
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
sub code, r9d ;Z: code -= last_dc_val;
|
||||
mov dctbl, POINTER [rsp+6*8+4*8]
|
||||
mov actbl, POINTER [rsp+6*8+5*8]
|
||||
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
|
||||
lea nbits_base, [rel jpeg_nbits_table]
|
||||
add rsp, -DCTSIZE2 * SIZEOF_WORD
|
||||
mov t, rsp
|
||||
|
||||
mov buffer, r11 ; r11 is now scratch
|
||||
%else
|
||||
|
||||
mov put_buffer, MMWORD [r10+SIZEOF_POINTER*2] ; put_buffer = state->cur.put_buffer;
|
||||
mov put_bits, dword [r10+SIZEOF_POINTER*2+8] ; put_bits = state->cur.put_bits;
|
||||
push r10 ; r10 is now scratch
|
||||
; rdi = working_state *state
|
||||
; rsi = JOCTET *buffer
|
||||
; rdx = JCOEFPTR block
|
||||
; rcx = int last_dc_val
|
||||
; r8 = c_derived_tbl *dctbl
|
||||
; r9 = c_derived_tbl *actbl
|
||||
|
||||
; Encode the DC coefficient difference per section F.1.2.1
|
||||
movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
|
||||
sub edi, r13d ; r13 is not used anymore
|
||||
mov ebx, edi
|
||||
;X: X = code stream
|
||||
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
|
||||
push rbx
|
||||
push rbp
|
||||
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
|
||||
push r12
|
||||
mov state, rdi
|
||||
mov buffer, rsi
|
||||
movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
|
||||
movsx codeq, word [block] ;Z: code = block[0];
|
||||
lea nbits_base, [rel jpeg_nbits_table]
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
sub codeq, rcx ;Z: code -= last_dc_val;
|
||||
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
|
||||
lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
|
||||
|
||||
; This is a well-known technique for obtaining the absolute value
|
||||
; without a branch. It is derived from an assembly language technique
|
||||
; presented in "How to Optimize for the Pentium Processors",
|
||||
; Copyright (c) 1996, 1997 by Agner Fog.
|
||||
mov esi, edi
|
||||
sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
xor edi, esi ; temp ^= temp3;
|
||||
sub edi, esi ; temp -= temp3;
|
||||
%endif
|
||||
|
||||
; For a negative input, want temp2 = bitwise complement of abs(input)
|
||||
; This code assumes we are on a two's complement machine
|
||||
add ebx, esi ; temp2 += temp3;
|
||||
pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
|
||||
pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
|
||||
punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
|
||||
punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
|
||||
pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
|
||||
;A: (Row 0, offset 1)
|
||||
pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
|
||||
paddw xmm0, xmm4 ;A: w0[i] += w4[i];
|
||||
movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
|
||||
|
||||
; Find the number of bits needed for the magnitude of the coefficient
|
||||
lea r11, [rel jpeg_nbits_table]
|
||||
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
|
||||
; Emit the Huffman-coded symbol for the number of bits
|
||||
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
|
||||
movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
|
||||
EMIT_BITS r11, esi ; EMIT_BITS(code, size)
|
||||
movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
|
||||
pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
|
||||
pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
|
||||
movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
|
||||
movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
|
||||
punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
|
||||
pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
|
||||
pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
|
||||
pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
|
||||
; (Row 1, offset 1)
|
||||
pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
|
||||
paddw xmm1, xmm4 ;B: w1[i] += w4[i];
|
||||
movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
|
||||
pxor xmm4, xmm4 ;B: w4[i] = 0;
|
||||
pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
|
||||
|
||||
; Mask off any extra bits in code
|
||||
mov esi, 1
|
||||
mov ecx, edi
|
||||
shl esi, cl
|
||||
dec esi
|
||||
and ebx, esi ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
|
||||
; w/ signed saturation
|
||||
|
||||
; Emit that number of bits of the value, if positive,
|
||||
; or the complement of its magnitude, if negative.
|
||||
EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
|
||||
pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
|
||||
pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
|
||||
pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
|
||||
pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
|
||||
; (Row 3, offset 1)
|
||||
pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
|
||||
paddw xmm3, xmm4 ;D: w3[i] += w4[i];
|
||||
movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
|
||||
pxor xmm4, xmm4 ;D: w4[i] = 0;
|
||||
pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
|
||||
|
||||
; Prepare data
|
||||
xor ebx, ebx
|
||||
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
|
||||
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
|
||||
27, 20, 13, 6, 7, 14, 21, 28, 35, \
|
||||
xmm0, xmm1, xmm2, xmm3
|
||||
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
|
||||
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
|
||||
53, 60, 61, 54, 47, 55, 62, 63, 63, \
|
||||
xmm4, xmm5, xmm6, xmm7
|
||||
pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
|
||||
cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000,
|
||||
;Z: i.e. if code is positive
|
||||
pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
|
||||
pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
|
||||
adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
|
||||
pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
|
||||
pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
|
||||
movsxd codeq, code ;Z: sign extend code
|
||||
pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
|
||||
; (Row 2, offset 1)
|
||||
pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
|
||||
paddw xmm2, xmm4 ;C: w2[i] += w4[i];
|
||||
movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
|
||||
pxor xmm4, xmm4 ;C: w4[i] = 0;
|
||||
pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
|
||||
|
||||
pxor xmm8, xmm8
|
||||
pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
|
||||
pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
|
||||
pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
|
||||
pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
|
||||
pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
|
||||
pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
|
||||
pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
|
||||
pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
|
||||
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
|
||||
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
|
||||
packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
|
||||
packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
|
||||
pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
|
||||
pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
|
||||
pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
|
||||
pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
|
||||
shl r12, 16
|
||||
shl r14, 16
|
||||
or r11, r12
|
||||
or r13, r14
|
||||
shl r13, 32
|
||||
or r11, r13
|
||||
not r11 ; index = ~index;
|
||||
packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
|
||||
; w/ signed saturation
|
||||
|
||||
;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
|
||||
;jmp .EFN
|
||||
movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
|
||||
movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
|
||||
pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
|
||||
pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
|
||||
movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
|
||||
punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
|
||||
shl tempd, 16 ;Z: temp <<= 16;
|
||||
psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
|
||||
pxor xmm2, xmm2 ;H: w2[i] = 0;
|
||||
or put_bufferd, tempd ;Z: put_buffer |= temp;
|
||||
pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
|
||||
movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
|
||||
unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
|
||||
pxor xmm0, xmm0 ;H: w0[i] = 0;
|
||||
pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
|
||||
; (Row 7, offset 1)
|
||||
pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
|
||||
paddw xmm3, xmm2 ;H: w3[i] += w2[i];
|
||||
movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
|
||||
movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
|
||||
pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
|
||||
punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
|
||||
mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
|
||||
;Z: temp = dctbl->ehufco[nbits];
|
||||
movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
|
||||
psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
|
||||
shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
|
||||
and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
|
||||
pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
|
||||
pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
|
||||
shl tempq, nbitsb ;Z: temp <<= nbits;
|
||||
pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
|
||||
pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
|
||||
pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
|
||||
or code, tempd ;Z: code |= temp;
|
||||
movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
|
||||
pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
|
||||
pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
|
||||
pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
|
||||
; (Row 6, offset 1)
|
||||
pxor xmm2, xmm2 ;G: w2[i] = 0;
|
||||
pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
|
||||
pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
|
||||
paddw xmm4, xmm0 ;G: w4[i] += w0[i];
|
||||
movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
|
||||
pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
|
||||
; (Row 5, offset 1)
|
||||
pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
|
||||
pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
|
||||
|
||||
mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
|
||||
movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
|
||||
lea rsi, [t1]
|
||||
.BLOOP:
|
||||
bsf r12, r11 ; r = __builtin_ctzl(index);
|
||||
jz .ELOOP
|
||||
mov rcx, r12
|
||||
lea rsi, [rsi+r12*2] ; k += r;
|
||||
shr r11, cl ; index >>= r;
|
||||
movzx rdi, word [rsi] ; temp = t1[k];
|
||||
lea rbx, [rel jpeg_nbits_table]
|
||||
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
|
||||
.BRLOOP:
|
||||
cmp r12, 16 ; while (r > 15) {
|
||||
jl .ERLOOP
|
||||
EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
|
||||
sub r12, 16 ; r -= 16;
|
||||
jmp .BRLOOP
|
||||
.ERLOOP:
|
||||
; Emit Huffman symbol for run length / number of bits
|
||||
CHECKBUF31 ; uses rcx, rdx
|
||||
packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
|
||||
; w/ signed saturation
|
||||
|
||||
shl r12, 4 ; temp3 = (r << 4) + nbits;
|
||||
add r12, rdi
|
||||
mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
|
||||
movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
|
||||
PUT_BITS rbx
|
||||
pxor xmm0, xmm0 ;F: w0[i] = 0;
|
||||
pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
|
||||
pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
|
||||
pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
|
||||
pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
|
||||
paddw xmm1, xmm2 ;F: w1[i] += w2[i];
|
||||
movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
|
||||
pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
|
||||
; (Row 4, offset 1)
|
||||
%undef block
|
||||
%define free_bitsq rdx
|
||||
%define free_bitsd edx
|
||||
%define free_bitsb dl
|
||||
pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
|
||||
shl tempq, 48 ;Z: temp <<= 48;
|
||||
pxor xmm2, xmm2 ;E: w2[i] = 0;
|
||||
pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
|
||||
paddw xmm5, xmm0 ;E: w5[i] += w0[i];
|
||||
or tempq, put_buffer ;Z: temp |= put_buffer;
|
||||
movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
|
||||
lea t, [dword t - 2] ;Z: t = &t[-1];
|
||||
pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
|
||||
|
||||
;EMIT_CODE(code, size)
|
||||
packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
|
||||
; w/ signed saturation
|
||||
|
||||
movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
|
||||
; Mask off any extra bits in code
|
||||
mov rcx, rdi
|
||||
mov rdx, 1
|
||||
shl rdx, cl
|
||||
dec rdx
|
||||
and rbx, rdx ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
PUT_BITS rbx ; PUT_BITS(temp2, nbits)
|
||||
add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
|
||||
;Z: nbits += dctbl->ehufsi[nbits];
|
||||
%undef dctbl
|
||||
%define code_temp r8d
|
||||
pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
|
||||
mov free_bitsd, [state+working_state.cur.free_bits]
|
||||
;Z: free_bits = state->cur.free_bits;
|
||||
pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF;
|
||||
shl index, 32 ;Z: index <<= 32;
|
||||
mov put_buffer, [state+working_state.cur.put_buffer.simd]
|
||||
;Z: put_buffer = state->cur.put_buffer.simd;
|
||||
or index, tempq ;Z: index |= temp;
|
||||
not index ;Z: index = ~index;
|
||||
sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
|
||||
jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
|
||||
align 16
|
||||
.EMIT_CODE: ;Z: .EMIT_CODE:
|
||||
EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
|
||||
|
||||
shr r11, 1 ; index >>= 1;
|
||||
add rsi, 2 ; ++k;
|
||||
jmp .BLOOP
|
||||
.ELOOP:
|
||||
; If the last coef(s) were zero, emit an end-of-block code
|
||||
lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
|
||||
cmp rdi, rsi ; if (r > 0) {
|
||||
je .EFN
|
||||
mov ebx, INT [r15] ; code = actbl->ehufco[0];
|
||||
movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
|
||||
EMIT_BITS rbx, r12d
|
||||
.EFN:
|
||||
pop r10
|
||||
; Save put_buffer & put_bits
|
||||
mov MMWORD [r10+SIZEOF_POINTER*2], put_buffer ; state->cur.put_buffer = put_buffer;
|
||||
mov dword [r10+SIZEOF_POINTER*2+8], put_bits ; state->cur.put_bits = put_bits;
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
pop rbx
|
||||
uncollect_args 6
|
||||
pop_xmm 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
align 16
|
||||
; .BRLOOP: entered while nbits > 16.  Each pass inserts the code/size pair
; actbl->ehufco[0xf0] / actbl->ehufsi[0xf0] into put_buffer and lowers the
; pending count by 16 (kept in code_temp while nbits is reused for the size).
; If the bit buffer cannot hold the code, control goes to .EMIT_BRLOOP_CODE.
.BRLOOP: ; do {
|
||||
lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
|
||||
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
|
||||
; nbits = actbl->ehufsi[0xf0];
|
||||
mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
|
||||
; code = actbl->ehufco[0xf0];
|
||||
sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
|
||||
jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
|
||||
; The shift must come before nbits is overwritten, since it uses nbitsb as
; the shift count.
shl put_buffer, nbitsb ; put_buffer <<= nbits;
|
||||
mov nbits, code_temp ; nbits = code_temp;
|
||||
or put_buffer, codeq ; put_buffer |= code;
|
||||
cmp nbits, 16 ; if (nbits <= 16)
|
||||
jle .ERLOOP ; break;
|
||||
jmp .BRLOOP ; } while(1);
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
align 16
|
||||
times 5 nop
|
||||
; .ENTRY_SKIP_EMIT_CODE: reached when the preceding free_bits subtraction did
; not go negative, so the pending code is simply appended to put_buffer.
.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
|
||||
shl put_buffer, nbitsb ; put_buffer <<= nbits;
|
||||
or put_buffer, codeq ; put_buffer |= code;
|
||||
; .BLOOP_COND: skip the loop entirely when index has no bits set.
.BLOOP_COND: ; .BLOOP_COND:
|
||||
test index, index ; if (index != 0)
|
||||
jz .ELOOP ; {
|
||||
; .BLOOP: one pass per set bit of index.  nbits becomes (trailing zero count
; of index) + 1; t is advanced by that many words and index is shifted right
; by the same amount, so t ends up at the entry matching the bit just consumed.
.BLOOP: ; do {
|
||||
xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
|
||||
tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
|
||||
inc nbits ; ++nbits;
|
||||
lea t, [t + nbitsq * 2] ; t = &t[nbits];
|
||||
shr index, nbitsb ; index >>= nbits;
|
||||
; .EMIT_BRLOOP_CODE_END: also the re-entry point after .EMIT_BRLOOP_CODE.
.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
|
||||
cmp nbits, 16 ; if (nbits > 16)
|
||||
jg .BRLOOP ; goto .BRLOOP;
|
||||
; .ERLOOP: build and insert the code for the entry at *t.
; On entry nbits holds a count in 1..16.  temp is formed as
; (count * 16) + JPEG_NBITS(code), and the table lookups use temp - 16.
; The looked-up code is shifted left by the value's bit count and OR'ed with
; the masked value bits; nbits is then increased by the looked-up size, so a
; single insertion into put_buffer covers both parts.
; NBITS() and MASK_BITS() are macros defined outside this section; presumably
; they address the nbits lookup table and a table of (1 << n) - 1 masks --
; confirm against their definitions.
.ERLOOP: ; .ERLOOP:
|
||||
movsx codeq, word [t] ; code = *t;
|
||||
lea tempd, [nbitsq * 2] ; temp = nbits * 2;
|
||||
movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
|
||||
lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
|
||||
mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
|
||||
; code_temp = actbl->ehufco[temp-16];
|
||||
shl code_temp, nbitsb ; code_temp <<= nbits;
|
||||
and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
|
||||
add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
|
||||
; nbits += actbl->ehufsi[temp-16];
|
||||
or code, code_temp ; code |= code_temp;
|
||||
sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
|
||||
jle .EMIT_CODE ; goto .EMIT_CODE;
|
||||
shl put_buffer, nbitsb ; put_buffer <<= nbits;
|
||||
or put_buffer, codeq ; put_buffer |= code;
|
||||
test index, index
|
||||
jnz .BLOOP ; } while (index != 0);
|
||||
; .ELOOP: all set bits of index have been consumed.  t is converted to a byte
; offset relative to the stack pointer and compared with the offset that
; corresponds to the final entry; if it differs, the code/size pair
; actbl->ehufco[0] / actbl->ehufsi[0] is inserted.  Only the low 32 bits of
; t and rsp take part in the subtraction.
.ELOOP: ; } /* index != 0 */
|
||||
sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
|
||||
%ifdef WIN64
|
||||
cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
|
||||
%else
|
||||
cmp td, -2 * SIZEOF_WORD ; if (t != -2)
|
||||
%endif
|
||||
je .EFN ; {
|
||||
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
|
||||
; nbits = actbl->ehufsi[0];
|
||||
mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
|
||||
sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
|
||||
jg .EFN_SKIP_EMIT_CODE ; {
|
||||
; EMIT_QWORD jumps to its first argument when done, so execution does not
; fall through into .EFN_SKIP_EMIT_CODE.
EMIT_QWORD .EFN ; insert code, flush buffer
|
||||
align 16
|
||||
.EFN_SKIP_EMIT_CODE: ; } else {
|
||||
shl put_buffer, nbitsb ; put_buffer <<= nbits;
|
||||
or put_buffer, codeq ; put_buffer |= code;
|
||||
; .EFN: write the bit buffer and free-bit counter back to *state, release the
; scratch area (WIN64 only) and restore the saved registers.
.EFN: ; } }
|
||||
mov [state + working_state.cur.put_buffer.simd], put_buffer
|
||||
; state->cur.put_buffer.simd = put_buffer;
|
||||
; NOTE(review): only the low byte of free_bits is stored here, while it is
; loaded as a dword earlier in the function -- presumably the upper bytes of
; the field are always zero; confirm.
mov byte [state + working_state.cur.free_bits], free_bitsb
|
||||
; state->cur.free_bits = free_bits;
|
||||
%ifdef WIN64
|
||||
; Subtracting the negative size adds DCTSIZE2 * SIZEOF_WORD back to rsp.
sub rsp, -DCTSIZE2 * SIZEOF_WORD
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
%else
|
||||
pop r12
|
||||
pop rbp
|
||||
pop rbx
|
||||
%endif
|
||||
; NOTE(review): rax (buffer) is left as updated by the flush code; presumably
; it is the function's return value -- confirm against the C prototype.
ret
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
align 16
|
||||
; .EMIT_BRLOOP_CODE: out-of-line path taken from .BRLOOP when the bit buffer
; cannot hold the pending code.  EMIT_QWORD jumps to its first argument when
; done.  The second argument is an extra instruction; presumably it is emitted
; inside the macro so that nbits is restored from code_temp before the jump --
; confirm against the macro definition.
.EMIT_BRLOOP_CODE:
|
||||
EMIT_QWORD .EMIT_BRLOOP_CODE_END, {mov nbits, code_temp} ; insert code, flush buffer,
|
||||
; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
|
||||
Reference in New Issue
Block a user