diff --git a/ChangeLog.md b/ChangeLog.md index b4cca2ec..94907cc5 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -45,6 +45,13 @@ longer supports 32-bit Java virtual machines. Oracle no longer provides a 32-bit JVM for macOS, and Apple's implementation of Java 1.6 (Java for OS X systems) is long obsolete. +5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been +significantly optimized, resulting in a measured average overall compression +speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel +and AMD CPUs, as well as a measured average overall compression speedup of +0-23% on platforms that do not have a SIMD-accelerated Huffman encoding +implementation. + 2.0.4 ===== diff --git a/jchuff.c b/jchuff.c index c15ab26e..964da724 100644 --- a/jchuff.c +++ b/jchuff.c @@ -72,9 +72,33 @@ typedef unsigned long long bit_buf_type; typedef size_t bit_buf_type; #endif +/* NOTE: The more optimal Huffman encoding algorithm has not yet been + * implemented in the ARM NEON SIMD extensions, which is why we retain the old + * Huffman encoder behavior for that platform. + */ +#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__)) +typedef unsigned long long simd_bit_buf_type; +#else +typedef bit_buf_type simd_bit_buf_type; +#endif + +#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \ + (defined(__x86_64__) && defined(__ILP32__)) +#define BIT_BUF_SIZE 64 +#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32) +#define BIT_BUF_SIZE 32 +#else +#error Cannot determine word size +#endif +#define SIMD_BIT_BUF_SIZE (sizeof(simd_bit_buf_type) * 8) + typedef struct { - bit_buf_type put_buffer; /* current bit-accumulation buffer */ - int put_bits; /* # of bits now in it */ + union { + bit_buf_type c; + simd_bit_buf_type simd; + } put_buffer; /* current bit accumulation buffer */ + int free_bits; /* # of bits available in it */ + /* (ARM SIMD: # of bits now in it) */ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ } savable_state; @@ -110,6 +134,7 @@ typedef struct { size_t free_in_buffer; /* # of byte spaces remaining in buffer */ savable_state cur; /* Current bit buffer & DC state */ j_compress_ptr cinfo; /* dump_buffer needs access to this */ + int simd; } working_state; @@ -188,8 +213,17 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics) } /* Initialize bit buffer to empty */ - entropy->saved.put_buffer = 0; - entropy->saved.put_bits = 0; + if (entropy->simd) { + entropy->saved.put_buffer.simd = 0; +#if defined(__arm__) || defined(__aarch64__) + entropy->saved.free_bits = 0; +#else + entropy->saved.free_bits = SIMD_BIT_BUF_SIZE; +#endif + } else { + entropy->saved.put_buffer.c = 0; + entropy->saved.free_bits = BIT_BUF_SIZE; + } /* Initialize restart stuff */ entropy->restarts_to_go = cinfo->restart_interval; @@ -321,94 +355,94 @@ dump_buffer(working_state *state) /* Outputting bits to the file */ -/* These macros perform the same task as the emit_bits() function in the - * original libjpeg code. In addition to reducing overhead by explicitly - * inlining the code, additional performance is achieved by taking into - * account the size of the bit buffer and waiting until it is almost full - * before emptying it. This mostly benefits 64-bit platforms, since 6 - * bytes can be stored in a 64-bit bit buffer before it has to be emptied. +/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be + * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the + * byte is 0xFF. Otherwise, the output buffer pointer is advanced by 1, and + * the speculative 0 byte will be overwritten by the next byte. */ - -#define EMIT_BYTE() { \ - JOCTET c; \ - put_bits -= 8; \ - c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \ - *buffer++ = c; \ - if (c == 0xFF) /* need to stuff a zero byte? */ \ - *buffer++ = 0; \ +#define EMIT_BYTE(b) { \ + buffer[0] = (JOCTET)(b); \ + buffer[1] = 0; \ + buffer -= -2 + ((JOCTET)(b) < 0xFF); \ } -#define PUT_BITS(code, size) { \ - put_bits += size; \ - put_buffer = (put_buffer << size) | code; \ -} +/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write + * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to + * encode 0xFF as 0xFF 0x00. + */ +#if BIT_BUF_SIZE == 64 -#if SIZEOF_SIZE_T != 8 && !defined(_WIN64) - -#define CHECKBUF15() { \ - if (put_bits > 15) { \ - EMIT_BYTE() \ - EMIT_BYTE() \ +#define FLUSH() { \ + if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ + EMIT_BYTE(put_buffer >> 56) \ + EMIT_BYTE(put_buffer >> 48) \ + EMIT_BYTE(put_buffer >> 40) \ + EMIT_BYTE(put_buffer >> 32) \ + EMIT_BYTE(put_buffer >> 24) \ + EMIT_BYTE(put_buffer >> 16) \ + EMIT_BYTE(put_buffer >> 8) \ + EMIT_BYTE(put_buffer ) \ + } else { \ + buffer[0] = (JOCTET)(put_buffer >> 56); \ + buffer[1] = (JOCTET)(put_buffer >> 48); \ + buffer[2] = (JOCTET)(put_buffer >> 40); \ + buffer[3] = (JOCTET)(put_buffer >> 32); \ + buffer[4] = (JOCTET)(put_buffer >> 24); \ + buffer[5] = (JOCTET)(put_buffer >> 16); \ + buffer[6] = (JOCTET)(put_buffer >> 8); \ + buffer[7] = (JOCTET)(put_buffer); \ + buffer += 8; \ } \ } -#endif - -#define CHECKBUF31() { \ - if (put_bits > 31) { \ - EMIT_BYTE() \ - EMIT_BYTE() \ - EMIT_BYTE() \ - EMIT_BYTE() \ - } \ -} - -#define CHECKBUF47() { \ - if (put_bits > 47) { \ - EMIT_BYTE() \ - EMIT_BYTE() \ - EMIT_BYTE() \ - EMIT_BYTE() \ - EMIT_BYTE() \ - EMIT_BYTE() \ - } \ -} - -#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T) -#error Cannot determine word size -#endif - -#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__)) - -#define EMIT_BITS(code, size) { \ - CHECKBUF47() \ - PUT_BITS(code, size) \ -} - -#define EMIT_CODE(code, size) { \ - temp2 &= (((JLONG)1) << nbits) - 1; \ - CHECKBUF31() \ - PUT_BITS(code, size) \ - PUT_BITS(temp2, nbits) \ -} - #else -#define EMIT_BITS(code, size) { \ - PUT_BITS(code, size) \ - CHECKBUF15() \ -} - -#define EMIT_CODE(code, size) { \ - temp2 &= (((JLONG)1) << nbits) - 1; \ - PUT_BITS(code, size) \ - CHECKBUF15() \ - PUT_BITS(temp2, nbits) \ - CHECKBUF15() \ +#define FLUSH() { \ + if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \ + EMIT_BYTE(put_buffer >> 24) \ + EMIT_BYTE(put_buffer >> 16) \ + EMIT_BYTE(put_buffer >> 8) \ + EMIT_BYTE(put_buffer ) \ + } else { \ + buffer[0] = (JOCTET)(put_buffer >> 24); \ + buffer[1] = (JOCTET)(put_buffer >> 16); \ + buffer[2] = (JOCTET)(put_buffer >> 8); \ + buffer[3] = (JOCTET)(put_buffer); \ + buffer += 4; \ + } \ } #endif +/* Fill the bit buffer to capacity with the leading bits from code, then output + * the bit buffer and put the remaining bits from code into the bit buffer. + */ +#define PUT_AND_FLUSH(code, size) { \ + put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ + FLUSH() \ + free_bits += BIT_BUF_SIZE; \ + put_buffer = code; \ +} + +/* Insert code into the bit buffer and output the bit buffer if needed. + * NOTE: We can't flush with free_bits == 0, since the left shift in + * PUT_AND_FLUSH() would have undefined behavior. + */ +#define PUT_BITS(code, size) { \ + free_bits -= size; \ + if (free_bits < 0) \ + PUT_AND_FLUSH(code, size) \ + else \ + put_buffer = (put_buffer << size) | code; \ +} + +#define PUT_CODE(code, size) { \ + temp &= (((JLONG)1) << nbits) - 1; \ + temp |= code << nbits; \ + nbits += size; \ + PUT_BITS(temp, nbits) \ +} + /* Although it is exceedingly rare, it is possible for a Huffman-encoded * coefficient block to be larger than the 128-byte unencoded block. For each @@ -431,6 +465,7 @@ dump_buffer(working_state *state) #define STORE_BUFFER() { \ if (localbuf) { \ + size_t bytes, bytestocopy; \ bytes = buffer - _buffer; \ buffer = _buffer; \ while (bytes > 0) { \ @@ -453,20 +488,46 @@ dump_buffer(working_state *state) LOCAL(boolean) flush_bits(working_state *state) { - JOCTET _buffer[BUFSIZE], *buffer; - bit_buf_type put_buffer; int put_bits; - size_t bytes, bytestocopy; int localbuf = 0; + JOCTET _buffer[BUFSIZE], *buffer, temp; + simd_bit_buf_type put_buffer; int put_bits; + int localbuf = 0; + + if (state->simd) { +#if defined __arm__ || defined __aarch64__ + put_bits = state->cur.free_bits; +#else + put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits; +#endif + put_buffer = state->cur.put_buffer.simd; + } else { + put_bits = BIT_BUF_SIZE - state->cur.free_bits; + put_buffer = state->cur.put_buffer.c; + } - put_buffer = state->cur.put_buffer; - put_bits = state->cur.put_bits; LOAD_BUFFER() - /* fill any partial byte with ones */ - PUT_BITS(0x7F, 7) - while (put_bits >= 8) EMIT_BYTE() + while (put_bits >= 8) { + put_bits -= 8; + temp = (JOCTET)(put_buffer >> put_bits); + EMIT_BYTE(temp) + } + if (put_bits) { + /* fill partial byte with ones */ + temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits)); + EMIT_BYTE(temp) + } - state->cur.put_buffer = 0; /* and reset bit-buffer to empty */ - state->cur.put_bits = 0; + if (state->simd) { /* and reset bit buffer to empty */ + state->cur.put_buffer.simd = 0; +#if defined __arm__ || defined __aarch64__ + state->cur.free_bits = 0; +#else + state->cur.free_bits = SIMD_BIT_BUF_SIZE; +#endif + } else { + state->cur.put_buffer.c = 0; + state->cur.free_bits = BIT_BUF_SIZE; + } STORE_BUFFER() return TRUE; @@ -480,7 +541,7 @@ encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl) { JOCTET _buffer[BUFSIZE], *buffer; - size_t bytes, bytestocopy; int localbuf = 0; + int localbuf = 0; LOAD_BUFFER() @@ -496,53 +557,41 @@ LOCAL(boolean) encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl) { - int temp, temp2, temp3; - int nbits; - int r, code, size; + int temp, nbits, free_bits; + bit_buf_type put_buffer; JOCTET _buffer[BUFSIZE], *buffer; - bit_buf_type put_buffer; int put_bits; - int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0]; - size_t bytes, bytestocopy; int localbuf = 0; + int localbuf = 0; - put_buffer = state->cur.put_buffer; - put_bits = state->cur.put_bits; + free_bits = state->cur.free_bits; + put_buffer = state->cur.put_buffer.c; LOAD_BUFFER() /* Encode the DC coefficient difference per section F.1.2.1 */ - temp = temp2 = block[0] - last_dc_val; + temp = block[0] - last_dc_val; /* This is a well-known technique for obtaining the absolute value without a * branch. It is derived from an assembly language technique presented in * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by - * Agner Fog. + * Agner Fog. This code assumes we are on a two's complement machine. */ - temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - temp ^= temp3; - temp -= temp3; - - /* For a negative input, want temp2 = bitwise complement of abs(input) */ - /* This code assumes we are on a two's complement machine */ - temp2 += temp3; + nbits = temp >> (CHAR_BIT * sizeof(int) - 1); + temp += nbits; + nbits ^= temp; /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = JPEG_NBITS(temp); + nbits = JPEG_NBITS(nbits); - /* Emit the Huffman-coded symbol for the number of bits */ - code = dctbl->ehufco[nbits]; - size = dctbl->ehufsi[nbits]; - EMIT_BITS(code, size) - - /* Mask off any extra bits in code */ - temp2 &= (((JLONG)1) << nbits) - 1; - - /* Emit that number of bits of the value, if positive, */ - /* or the complement of its magnitude, if negative. */ - EMIT_BITS(temp2, nbits) + /* Emit the Huffman-coded symbol for the number of bits. + * Emit that number of bits of the value, if positive, + * or the complement of its magnitude, if negative. + */ + PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits]) /* Encode the AC coefficients per section F.1.2.2 */ - r = 0; /* r = run length of zeros */ + { + int r = 0; /* r = run length of zeros */ /* Manually unroll the k loop to eliminate the counter variable. This * improves performance greatly on systems with a limited number of @@ -550,51 +599,46 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val, */ #define kloop(jpeg_natural_order_of_k) { \ if ((temp = block[jpeg_natural_order_of_k]) == 0) { \ - r++; \ + r += 16; \ } else { \ - temp2 = temp; \ /* Branch-less absolute value, bitwise complement, etc., same as above */ \ - temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \ - temp ^= temp3; \ - temp -= temp3; \ - temp2 += temp3; \ - nbits = JPEG_NBITS_NONZERO(temp); \ + nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \ + temp += nbits; \ + nbits ^= temp; \ + nbits = JPEG_NBITS_NONZERO(nbits); \ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ - while (r > 15) { \ - EMIT_BITS(code_0xf0, size_0xf0) \ - r -= 16; \ + while (r >= 16 * 16) { \ + r -= 16 * 16; \ + PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \ } \ /* Emit Huffman symbol for run length / number of bits */ \ - temp3 = (r << 4) + nbits; \ - code = actbl->ehufco[temp3]; \ - size = actbl->ehufsi[temp3]; \ - EMIT_CODE(code, size) \ + r += nbits; \ + PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \ r = 0; \ } \ } - /* One iteration for each value in jpeg_natural_order[] */ - kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3); - kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18); - kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26); - kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27); - kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21); - kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57); - kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15); - kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58); - kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39); - kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47); - kloop(55); kloop(62); kloop(63); + /* One iteration for each value in jpeg_natural_order[] */ + kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3); + kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18); + kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26); + kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27); + kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21); + kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57); + kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15); + kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58); + kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39); + kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47); + kloop(55); kloop(62); kloop(63); - /* If the last coef(s) were zero, emit an end-of-block code */ - if (r > 0) { - code = actbl->ehufco[0]; - size = actbl->ehufsi[0]; - EMIT_BITS(code, size) + /* If the last coef(s) were zero, emit an end-of-block code */ + if (r > 0) { + PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) + } } - state->cur.put_buffer = put_buffer; - state->cur.put_bits = put_bits; + state->cur.put_buffer.c = put_buffer; + state->cur.free_bits = free_bits; STORE_BUFFER() return TRUE; @@ -643,6 +687,7 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data) state.free_in_buffer = cinfo->dest->free_in_buffer; state.cur = entropy->saved; state.cinfo = cinfo; + state.simd = entropy->simd; /* Emit restart marker if needed */ if (cinfo->restart_interval) { @@ -712,6 +757,7 @@ finish_pass_huff(j_compress_ptr cinfo) state.free_in_buffer = cinfo->dest->free_in_buffer; state.cur = entropy->saved; state.cinfo = cinfo; + state.simd = entropy->simd; /* Flush out the last data */ if (!flush_bits(&state)) diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm index 79f0ca52..342cc437 100644 --- a/simd/i386/jchuff-sse2.asm +++ b/simd/i386/jchuff-sse2.asm @@ -1,8 +1,9 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (SSE2) ; -; Copyright (C) 2009-2011, 2014-2017, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. +; Copyright (C) 2018, Matthias Räncker. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -15,134 +16,256 @@ ; http://sourceforge.net/project/showfiles.php?group_id=6208 ; ; This file contains an SSE2 implementation for Huffman coding of one block. -; The following code is based directly on jchuff.c; see jchuff.c for more -; details. +; The following code is based on jchuff.c; see jchuff.c for more details. %include "jsimdext.inc" +struc working_state +.next_output_byte: resp 1 ; => next byte to write in buffer +.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer +.cur.put_buffer.simd resq 1 ; current bit accumulation buffer +.cur.free_bits resd 1 ; # of bits available in it +.cur.last_dc_val resd 4 ; last DC coef for each component +.cinfo: resp 1 ; dump_buffer needs access to this +endstruc + +struc c_derived_tbl +.ehufco: resd 256 ; code for each symbol +.ehufsi: resb 256 ; length of code for each symbol +; If no code has been allocated for a symbol S, ehufsi[S] contains 0 +endstruc + ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 GLOBAL_DATA(jconst_huff_encode_one_block) EXTN(jconst_huff_encode_one_block): -%include "jpeg_nbits_table.inc" + alignz 32 + +jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007 + dq 0x000f, 0x001f, 0x003f, 0x007f + dq 0x00ff, 0x01ff, 0x03ff, 0x07ff + dq 0x0fff, 0x1fff, 0x3fff, 0x7fff + +times 1 << 14 db 15 +times 1 << 13 db 14 +times 1 << 12 db 13 +times 1 << 11 db 12 +times 1 << 10 db 11 +times 1 << 9 db 10 +times 1 << 8 db 9 +times 1 << 7 db 8 +times 1 << 6 db 7 +times 1 << 5 db 6 +times 1 << 4 db 5 +times 1 << 3 db 4 +times 1 << 2 db 3 +times 1 << 1 db 2 +times 1 << 0 db 1 +times 1 db 0 +jpeg_nbits_table: +times 1 db 0 +times 1 << 0 db 1 +times 1 << 1 db 2 +times 1 << 2 db 3 +times 1 << 3 db 4 +times 1 << 4 db 5 +times 1 << 5 db 6 +times 1 << 6 db 7 +times 1 << 7 db 8 +times 1 << 8 db 9 +times 1 << 9 db 10 +times 1 << 10 db 11 +times 1 << 11 db 12 +times 1 << 12 db 13 +times 1 << 13 db 14 +times 1 << 14 db 15 alignz 32 +%ifdef PIC +%define NBITS(x) nbits_base + x +%else +%define NBITS(x) jpeg_nbits_table + x +%endif +%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table) + ; -------------------------------------------------------------------------- SECTION SEG_TEXT BITS 32 -; These macros perform the same task as the emit_bits() function in the -; original libjpeg code. In addition to reducing overhead by explicitly -; inlining the code, additional performance is achieved by taking into -; account the size of the bit buffer and waiting until it is almost full -; before emptying it. This mostly benefits 64-bit platforms, since 6 -; bytes can be stored in a 64-bit bit buffer before it has to be emptied. +%define mm_put_buffer mm0 +%define mm_all_0xff mm1 +%define mm_temp mm2 +%define mm_nbits mm3 +%define mm_code_bits mm3 +%define mm_code mm4 +%define mm_overflow_bits mm5 +%define mm_save_nbits mm6 -%macro EMIT_BYTE 0 - sub put_bits, 8 ; put_bits -= 8; - mov edx, put_buffer - mov ecx, put_bits - shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); - mov byte [eax], dl ; *buffer++ = c; - add eax, 1 - cmp dl, 0xFF ; need to stuff a zero byte? - jne %%.EMIT_BYTE_END - mov byte [eax], 0 ; *buffer++ = 0; - add eax, 1 -%%.EMIT_BYTE_END: -%endmacro +; Shorthand used to describe SIMD operations: +; wN: xmmN treated as eight signed 16-bit values +; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7 +; bN: xmmN treated as 16 unsigned 8-bit values, or +; mmN treated as eight unsigned 8-bit values +; bN[i]: perform the same operation on all unsigned 8-bit values, +; i=0..15 (SSE register) or i=0..7 (MMX register) +; Contents of SIMD registers are shown in memory order. -%macro PUT_BITS 1 - add put_bits, ecx ; put_bits += size; - shl put_buffer, cl ; put_buffer = (put_buffer << size); - or put_buffer, %1 -%endmacro +; Fill the bit buffer to capacity with the leading bits from code, then output +; the bit buffer and put the remaining bits from code into the bit buffer. +; +; Usage: +; code - contains the bits to shift into the bit buffer (LSB-aligned) +; %1 - temp register +; %2 - low byte of temp register +; %3 - second byte of temp register +; %4-%8 (optional) - extra instructions to execute before the macro completes +; %9 - the label to which to jump when the macro completes +; +; Upon completion, free_bits will be set to the number of remaining bits from +; code, and put_buffer will contain those remaining bits. temp and code will +; be clobbered. +; +; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE() +; macro in jchuff.c. -%macro CHECKBUF15 0 - cmp put_bits, 16 ; if (put_bits > 31) { - jl %%.CHECKBUF15_END - mov eax, POINTER [esp+buffer] - EMIT_BYTE - EMIT_BYTE - mov POINTER [esp+buffer], eax -%%.CHECKBUF15_END: -%endmacro - -%macro EMIT_BITS 1 - PUT_BITS %1 - CHECKBUF15 -%endmacro - -%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) - pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128(); - pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); - pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); - pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); - pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; - pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; - pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; - pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; - pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; - pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; - pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; - pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; - pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; - pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; - pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; - pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; - pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; - pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; - pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; - pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; - pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; - pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; - pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; - pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; - pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; - pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; - pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; - pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; - pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; - pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; - pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; - pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; - pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; - pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; - pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; -%if %1 != 32 - pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; -%else - pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; +%macro EMIT_QWORD 9 +%define %%temp %1 +%define %%tempb %2 +%define %%temph %3 + add nbits, free_bits ; nbits += free_bits; + neg free_bits ; free_bits = -free_bits; + movq mm_temp, mm_code ; temp = code; + movd mm_nbits, nbits ; nbits --> MMX register + movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits; + neg free_bits ; free_bits = -free_bits; + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits; + add free_bits, 64 ; free_bits += 64; + por mm_temp, mm_put_buffer ; temp |= put_buffer; +%ifidn %%temp, nbits_base + movd mm_save_nbits, nbits_base ; save nbits_base +%endif + movq mm_code_bits, mm_temp ; code_bits (temp register) = temp; + movq mm_put_buffer, mm_code ; put_buffer = code; + pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0); + movq mm_code, mm_code_bits ; code = code_bits; + psrlq mm_code_bits, 32 ; code_bits >>= 32; + pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i); + movd %%temp, mm_code_bits ; temp = code_bits; + bswap %%temp ; temp = htonl(temp); + test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */ + jnz %%.SLOW ; goto %%.SLOW + mov dword [buffer], %%temp ; *(uint32_t)buffer = temp; +%ifidn %%temp, nbits_base + movd nbits_base, mm_save_nbits ; restore nbits_base +%endif + %4 + movd nbits, mm_code ; nbits = (uint32_t)(code); + %5 + bswap nbits ; nbits = htonl(nbits); + mov dword [buffer + 4], nbits ; *(uint32_t)(buffer + 4) = nbits; + lea buffer, [buffer + 8] ; buffer += 8; + %6 + %7 + %8 + jmp %9 ; return +%%.SLOW: + ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8 + ; bytes in the qword. + mov byte [buffer], %%tempb ; buffer[0] = temp[0]; + cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], %%temph ; buffer[0] = temp[1]; + cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr %%temp, 16 ; temp >>= 16; + mov byte [buffer], %%tempb ; buffer[0] = temp[0]; + cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], %%temph ; buffer[0] = temp[1]; + cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code) +%ifidn %%temp, nbits_base + movd nbits_base, mm_save_nbits ; restore nbits_base +%endif + bswap nbits ; nbits = htonl(nbits) + mov byte [buffer], nbitsb ; buffer[0] = nbits[0]; + cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0)); + mov byte [buffer], nbitsh ; buffer[0] = nbits[1]; + cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0)); + shr nbits, 16 ; nbits >>= 16; + mov byte [buffer], nbitsb ; buffer[0] = nbits[0]; + cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0)); + mov byte [buffer], nbitsh ; buffer[0] = nbits[1]; + %4 + cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0)); + %5 + %6 + %7 + %8 + jmp %9 ; return; +%endmacro + +%macro PUSH 1 + push %1 +%assign stack_offset stack_offset + 4 +%endmacro + +%macro POP 1 + pop %1 +%assign stack_offset stack_offset - 4 +%endmacro + +; If PIC is defined, load the address of a symbol defined in this file into a +; register. Equivalent to +; get_GOT %1 +; lea %1, [GOTOFF(%1, %2)] +; without using the GOT. +; +; Usage: +; %1 - register into which to load the address of the symbol +; %2 - symbol whose address should be loaded +; %3 - optional multi-line macro to execute before the symbol address is loaded +; %4 - optional multi-line macro to execute after the symbol address is loaded +; +; If PIC is not defined, then %3 and %4 are executed in order. + +%macro GET_SYM 2-4 +%ifdef PIC + call %%.geteip +%%.ref: + %4 + add %1, %2 + sub %1, %%.ref + jmp short %%.done + align 32 +%%.geteip: + %3 4 ; must adjust stack pointer because of call + mov %1, POINTER [esp] + ret + align 32 +%%.done: +%else + %3 0 + %4 %endif - pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); - paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); - paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); - paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); - paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); - pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); - pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); - pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg); - pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); - pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); - pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); - pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); - pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); - movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); - movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); - movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); - movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); - movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); - movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); - movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); - movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); %endmacro ; @@ -153,272 +276,487 @@ EXTN(jconst_huff_encode_one_block): ; JCOEFPTR block, int last_dc_val, ; c_derived_tbl *dctbl, c_derived_tbl *actbl) ; +; Stack layout: +; Function args +; Return address +; Saved ebx +; Saved ebp +; Saved esi +; Saved edi <-- esp_save +; ... +; esp_save +; t_ 64*2 bytes (aligned to 128 bytes) +; +; esp is used (as t) to point into t_ (data in lower indices is not used once +; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows +; us to find the rest of the data again. +; +; NOTES: +; When shuffling data, we try to avoid pinsrw as much as possible, since it is +; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on +; modern CPUs, so chains of pinsrw instructions (even with different outputs) +; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and +; requires 2 µops (with memory operand) on Intel. In either case, only one +; pinsrw instruction can be decoded per cycle (and nothing else if they are +; back-to-back), so out-of-order execution cannot be used to work around long +; pinsrw chains (though for Sandy Bridge and later, this may be less of a +; problem if the code runs from the µop cache.) +; +; We use tzcnt instead of bsf without checking for support. The instruction is +; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to +; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is +; an input dependency (although the behavior is not formally defined, Intel +; CPUs usually leave the destination unmodified if the source is zero.) This +; can prevent out-of-order execution, so we clear the destination before +; invoking tzcnt. +; +; Initial register allocation +; eax - frame --> buffer +; ebx - nbits_base (PIC) / emit_temp +; ecx - dctbl --> size --> state +; edx - block --> nbits +; esi - code_temp --> state --> actbl +; edi - index_temp --> free_bits +; esp - t +; ebp - index -; eax + 8 = working_state *state -; eax + 12 = JOCTET *buffer -; eax + 16 = JCOEFPTR block -; eax + 20 = int last_dc_val -; eax + 24 = c_derived_tbl *dctbl -; eax + 28 = c_derived_tbl *actbl +%define frame eax +%ifdef PIC +%define nbits_base ebx +%endif +%define emit_temp ebx +%define emit_tempb bl +%define emit_temph bh +%define dctbl ecx +%define block edx +%define code_temp esi +%define index_temp edi +%define t esp +%define index ebp -%define pad 6 * SIZEOF_DWORD ; Align to 16 bytes -%define t1 pad -%define t2 t1 + (DCTSIZE2 * SIZEOF_WORD) -%define block t2 + (DCTSIZE2 * SIZEOF_WORD) -%define actbl block + SIZEOF_DWORD -%define buffer actbl + SIZEOF_DWORD -%define temp buffer + SIZEOF_DWORD -%define temp2 temp + SIZEOF_DWORD -%define temp3 temp2 + SIZEOF_DWORD -%define temp4 temp3 + SIZEOF_DWORD -%define temp5 temp4 + SIZEOF_DWORD -%define gotptr temp5 + SIZEOF_DWORD ; void *gotptr -%define put_buffer ebx -%define put_bits edi +%assign save_frame DCTSIZE2 * SIZEOF_WORD + +; Step 1: Re-arrange input data according to jpeg_natural_order +; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 +; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05 +; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34 +; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28 +; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36 +; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51 +; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46 +; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63 align 32 GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): - push ebp - mov eax, esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp], eax - mov ebp, esp ; ebp = aligned ebp - sub esp, temp5+9*SIZEOF_DWORD-pad - push ebx - push ecx -; push edx ; need not be preserved - push esi - push edi - push ebp - mov esi, POINTER [eax+8] ; (working_state *state) - mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer; - mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits; - push esi ; esi is now scratch +%assign stack_offset 0 +%define arg_state 4 + stack_offset +%define arg_buffer 8 + stack_offset +%define arg_block 12 + stack_offset +%define arg_last_dc_val 16 + stack_offset +%define arg_dctbl 20 + stack_offset +%define arg_actbl 24 + stack_offset - get_GOT edx ; get GOT address - movpic POINTER [esp+gotptr], edx ; save GOT address + ;X: X = code stream + mov block, [esp + arg_block] + PUSH ebx + PUSH ebp + movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 + PUSH esi + PUSH edi + movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 + mov frame, esp + lea t, [frame - (save_frame + 4)] + movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 + and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0] + mov [t + save_frame], frame + pxor xmm4, xmm4 ;A: w4[i] = 0; + punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 + pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 + pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 + punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 + punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13 + pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17 + ;A: (Row 0, offset 1) + pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0); + paddw xmm0, xmm4 ;A: w0[i] += w4[i]; + movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i]; - mov ecx, POINTER [eax+28] - mov edx, POINTER [eax+16] - mov esi, POINTER [eax+12] - mov POINTER [esp+actbl], ecx - mov POINTER [esp+block], edx - mov POINTER [esp+buffer], esi + movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- -- + pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- -- + pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12 + movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55 + movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12 + punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51 + pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12 + pxor xmm4, xmm4 ;A: w4[i] = 0; + psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- -- + pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0); + pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12 + ; (Row 1, offset 1) + pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0); + paddw xmm1, xmm4 ;B: w1[i] += w4[i]; + movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i]; + pxor xmm4, xmm4 ;B: w4[i] = 0; + pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0); - ; Encode the DC coefficient difference per section F.1.2.1 - mov esi, POINTER [esp+block] ; block - movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; - sub ecx, dword [eax+20] - mov esi, ecx + packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i] + ; w/ signed saturation - ; This is a well-known technique for obtaining the absolute value - ; with out a branch. It is derived from an assembly language technique - ; presented in "How to Optimize for the Pentium Processors", - ; Copyright (c) 1996, 1997 by Agner Fog. - mov edx, ecx - sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - xor ecx, edx ; temp ^= temp3; - sub ecx, edx ; temp -= temp3; + pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- -- + pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- -- + pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 -- + pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35 + ; (Row 3, offset 1) + pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0); + paddw xmm3, xmm4 ;D: w3[i] += w4[i]; + movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i]; + pxor xmm4, xmm4 ;D: w4[i] = 0; + pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0); - ; For a negative input, want temp2 = bitwise complement of abs(input) - ; This code assumes we are on a two's complement machine - add esi, edx ; temp2 += temp3; - mov dword [esp+temp], esi ; backup temp2 in temp + pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51 + pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51 + pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51 + pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51 + pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51 + pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27 + ; (Row 2, offset 1) + pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0); + paddw xmm2, xmm4 ;C: w2[i] += w4[i]; + movsx code_temp, word [block] ;Z: code_temp = block[0]; - ; Find the number of bits needed for the magnitude of the coefficient - movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) - movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp); - mov dword [esp+temp2], edx ; backup nbits in temp2 +; %1 - stack pointer adjustment +%macro GET_SYM_BEFORE 1 + movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2 + ;C: t[i+16] = w2[i]; + pxor xmm4, xmm4 ;C: w4[i] = 0; + pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0); + sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val; - ; Emit the Huffman-coded symbol for the number of bits - mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore - mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits]; - movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; - EMIT_BITS eax ; EMIT_BITS(code, size) + packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i] + ; w/ signed saturation - mov ecx, dword [esp+temp2] ; restore nbits + movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55 + pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i); + pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i); + movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63 + punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63 + shl index_temp, 16 ;Z: index_temp <<= 16; + psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 -- + pxor xmm2, xmm2 ;H: w2[i] = 0; + pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 -- + or index, index_temp ;Z: index |= index_temp; +%undef index_temp +%define free_bits edi +%endmacro - ; Mask off any extra bits in code - mov eax, 1 - shl eax, cl - dec eax - and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<= 0 ? 1 : 0); + pxor xmm2, xmm2 ;G: w2[i] = 0; + pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0); + pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58 + paddw xmm4, xmm0 ;G: w4[i] += w0[i]; + movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i]; + movd mm_temp, code_temp ;Z: temp = code_temp + pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58 + ; (Row 5, offset 1) + pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0); - pxor xmm7, xmm7 - movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); - movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); - movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); - movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); - pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); - pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); - pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); - pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); - packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); - packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); - pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; - pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; - shl ecx, 16 - or edx, ecx - not edx ; index = ~index; + packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i] + ; w/ signed saturation - lea esi, [esp+t1] - mov ebp, POINTER [esp+actbl] ; ebp = actbl + lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1] + pxor xmm0, xmm0 ;F: w0[i] = 0; + pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0); + paddw xmm1, xmm2 ;F: w1[i] += w2[i]; + movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i]; + pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); + pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59 + pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59 + pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59 + pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 + ; (Row 4, offset 1) +%undef block +%define nbits edx +%define nbitsb dl +%define nbitsh dh + movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp); +%undef code_temp +%define state esi + pxor xmm2, xmm2 ;E: w2[i] = 0; + mov state, [frame + arg_state] + movd mm_nbits, nbits ;Z: nbits --> MMX register + pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0); + movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4] + ;Z: code = dctbl->ehufco[nbits]; +%define size ecx +%define sizeb cl +%define sizeh ch + paddw xmm5, xmm0 ;E: w5[i] += w0[i]; + movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i]; + movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits] + ;Z: size = dctbl->ehufsi[nbits]; +%undef dctbl + pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0); -.BLOOP: - bsf ecx, edx ; r = __builtin_ctzl(index); - jz near .ELOOP - lea esi, [esi+ecx*2] ; k += r; - shr edx, cl ; index >>= r; - mov dword [esp+temp3], edx -.BRLOOP: - cmp ecx, 16 ; while (r > 15) { - jl near .ERLOOP - sub ecx, 16 ; r -= 16; - mov dword [esp+temp], ecx - mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; - movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; - EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) - mov ecx, dword [esp+temp] - jmp .BRLOOP -.ERLOOP: - movsx eax, word [esi] ; temp = t1[k]; - movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) - movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp); - mov dword [esp+temp2], eax - ; Emit Huffman symbol for run length / number of bits - shl ecx, 4 ; temp3 = (r << 4) + nbits; - add ecx, eax - mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; - movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; - EMIT_BITS eax + packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i] + ; w/ signed saturation - movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; - ; Mask off any extra bits in code - mov ecx, dword [esp+temp2] - mov eax, 1 - shl eax, cl - dec eax - and eax, edx ; temp2 &= (((JLONG)1)<>= 1; + movq mm_put_buffer, [state + working_state.cur.put_buffer.simd] + ;Z: put_buffer = state->cur.put_buffer.simd; + mov free_bits, [state + working_state.cur.free_bits] + ;Z: free_bits = state->cur.free_bits; +%undef state +%define actbl esi + mov actbl, [frame + arg_actbl] +%define buffer eax + mov buffer, [frame + arg_buffer] +%undef frame + jmp .BEGIN - jmp .BLOOP -.ELOOP: - movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); - movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); - movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); - movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); - pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); - pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); - pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); - pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); - packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); - packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); - pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; - pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; - shl ecx, 16 - or edx, ecx - not edx ; index = ~index; +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - lea eax, [esp + t1 + (DCTSIZE2/2) * 2] - sub eax, esi - shr eax, 1 - bsf ecx, edx ; r = __builtin_ctzl(index); - jz near .ELOOP2 - shr edx, cl ; index >>= r; - add ecx, eax - lea esi, [esi+ecx*2] ; k += r; - mov dword [esp+temp3], edx - jmp .BRLOOP2 -.BLOOP2: - bsf ecx, edx ; r = __builtin_ctzl(index); - jz near .ELOOP2 - lea esi, [esi+ecx*2] ; k += r; - shr edx, cl ; index >>= r; - mov dword [esp+temp3], edx -.BRLOOP2: - cmp ecx, 16 ; while (r > 15) { - jl near .ERLOOP2 - sub ecx, 16 ; r -= 16; - mov dword [esp+temp], ecx - mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; - movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; - EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) - mov ecx, dword [esp+temp] - jmp .BRLOOP2 -.ERLOOP2: - movsx eax, word [esi] ; temp = t1[k]; - bsr eax, eax ; nbits = 32 - __builtin_clz(temp); - inc eax - mov dword [esp+temp2], eax - ; Emit Huffman symbol for run length / number of bits - shl ecx, 4 ; temp3 = (r << 4) + nbits; - add ecx, eax - mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; - movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; - EMIT_BITS eax + align 16 +; size <= 32, so this is not really a loop +.BRLOOP1: ; .BRLOOP1: + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] + ; nbits = actbl->ehufsi[0xf0]; + movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4] + ; code = actbl->ehufco[0xf0]; + and index, 0x7ffffff ; clear index if size == 32 + sub size, 16 ; size -= 16; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1; + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + por mm_put_buffer, mm_code ; put_buffer |= code; + jmp .ERLOOP1 ; goto .ERLOOP1; - movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; - ; Mask off any extra bits in code - mov ecx, dword [esp+temp2] - mov eax, 1 - shl eax, cl - dec eax - and eax, edx ; temp2 &= (((JLONG)1)<>= 1; +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - jmp .BLOOP2 -.ELOOP2: - ; If the last coef(s) were zero, emit an end-of-block code - lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; - cmp edx, esi ; if (r > 0) { - je .EFN - mov eax, INT [ebp] ; code = actbl->ehufco[0]; - movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0]; - EMIT_BITS eax -.EFN: - mov eax, [esp+buffer] - pop esi - ; Save put_buffer & put_bits - mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; - mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits; + align 16 +%ifdef PIC + times 6 nop +%else + times 2 nop +%endif +.BLOOP1: ; do { /* size = # of zero bits/elements to skip */ +; if size == 32, index remains unchanged. Correct in .BRLOOP. + shr index, sizeb ; index >>= size; + lea t, [t + size * SIZEOF_WORD] ; t += size; + cmp size, 16 ; if (size > 16) + jg .BRLOOP1 ; goto .BRLOOP1; +.ERLOOP1: ; .ERLOOP1: + movsx nbits, word [t] ; nbits = *t; +%ifdef PIC + add size, size ; size += size; +%else + lea size, [size * 2] ; size += size; +%endif + movd mm_temp, nbits ; temp = nbits; + movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits); + lea size, [size * 8 + nbits] ; size = size * 8 + nbits; + movd mm_nbits, nbits ; nbits --> MMX register + movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4] + ; code = actbl->ehufco[size-16]; + movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)] + ; size = actbl->ehufsi[size-16]; +.BEGIN: ; .BEGIN: + pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1; + psllq mm_code, mm_nbits ; code <<= nbits; + add nbits, size ; nbits += size; + por mm_code, mm_temp ; code |= temp; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1 + xor size, size ; size = 0; /* kill tzcnt input dependency */ + tzcnt size, index ; size = # of trailing 0 bits in index + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + inc size ; ++size; + por mm_put_buffer, mm_code ; put_buffer |= code; + test index, index + jnz .BLOOP1 ; } while (index != 0); +; Round 2 +; t points to the last used word, possibly below t_ if the previous index had 32 zero bits. +.ELOOP1: ; .ELOOP1: + pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i); + pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i); + shl size, 16 ; size <<= 16; + or index, size ; index |= size; + not index ; index = ~index; + lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD] + ; nbits = t + 1 + 64; + and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */ + sub nbits, t ; nbits -= t; + shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */ + tzcnt size, index ; size = # of trailing 0 bits in index + inc size ; ++size; + test index, index ; if (index == 0) + jz .ELOOP2 ; goto .ELOOP2; +; NOTE: size == 32 cannot happen, since the last element is always 0. + shr index, sizeb ; index >>= size; + lea size, [size + nbits - 33] ; size = size + nbits - 33; + lea t, [t + size * SIZEOF_WORD] ; t += size; + cmp size, 16 ; if (size <= 16) + jle .ERLOOP2 ; goto .ERLOOP2; +.BRLOOP2: ; do { + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] + ; nbits = actbl->ehufsi[0xf0]; + sub size, 16 ; size -= 16; + movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4] + ; code = actbl->ehufco[0xf0]; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_BRLOOP2 ; insert code and flush put_buffer + movd mm_nbits, nbits ; else { nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + por mm_put_buffer, mm_code ; put_buffer |= code; + cmp size, 16 ; if (size <= 16) + jle .ERLOOP2 ; goto .ERLOOP2; + jmp .BRLOOP2 ; } while(1); - pop ebp - pop edi - pop esi -; pop edx ; need not be preserved - pop ecx - pop ebx - mov esp, ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.BLOOP2: ; do { /* size = # of zero bits/elements to skip */ + shr index, sizeb ; index >>= size; + lea t, [t + size * SIZEOF_WORD] ; t += size; + cmp size, 16 ; if (size > 16) + jg .BRLOOP2 ; goto .BRLOOP2; +.ERLOOP2: ; .ERLOOP2: + movsx nbits, word [t] ; nbits = *t; + add size, size ; size += size; + movd mm_temp, nbits ; temp = nbits; + movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits); + movd mm_nbits, nbits ; nbits --> MMX register + lea size, [size * 8 + nbits] ; size = size * 8 + nbits; + movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4] + ; code = actbl->ehufco[size-16]; + movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)] + ; size = actbl->ehufsi[size-16]; + psllq mm_code, mm_nbits ; code <<= nbits; + pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1; + lea nbits, [nbits + size] ; nbits += size; + por mm_code, mm_temp ; code |= temp; + xor size, size ; size = 0; /* kill tzcnt input dependency */ + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2 + tzcnt size, index ; size = # of trailing 0 bits in index + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + inc size ; ++size; + por mm_put_buffer, mm_code ; put_buffer |= code; + test index, index + jnz .BLOOP2 ; } while (index != 0); +.ELOOP2: ; .ELOOP2: + mov nbits, t ; nbits = t; + lea t, [t + SIZEOF_WORD] ; t = &t[1]; + and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127; + and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */ + cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2) + je .EFN ; { + movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0] + ; code = actbl->ehufco[0]; + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] + ; nbits = actbl->ehufsi[0]; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jg .EFN_SKIP_EMIT_CODE ; { + EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer + align 16 +.EFN_SKIP_EMIT_CODE: ; } else { + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + por mm_put_buffer, mm_code ; put_buffer |= code; +.EFN: ; } } +%define frame esp + mov frame, [t + save_frame] +%define state ecx + mov state, [frame + arg_state] + movq [state + working_state.cur.put_buffer.simd], mm_put_buffer + ; state->cur.put_buffer.simd = put_buffer; + emms + mov [state + working_state.cur.free_bits], free_bits + ; state->cur.free_bits = free_bits; + POP edi + POP esi + POP ebp + POP ebx ret +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_BRLOOP1: + EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \ + .ERLOOP1 + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_ERLOOP1: + EMIT_QWORD size, sizeb, sizeh, \ + {xor size, size}, \ + {tzcnt size, index}, \ + {inc size}, \ + {test index, index}, \ + {jnz .BLOOP1}, \ + .ELOOP1 + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_BRLOOP2: + EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \ + {cmp size, 16}, \ + {jle .ERLOOP2}, \ + .BRLOOP2 + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_ERLOOP2: + EMIT_QWORD size, sizeb, sizeh, \ + {xor size, size}, \ + {tzcnt size, index}, \ + {inc size}, \ + {test index, index}, \ + {jnz .BLOOP2}, \ + .ELOOP2 + ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. align 32 diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc index 8b743d74..36a116ff 100644 --- a/simd/nasm/jsimdext.inc +++ b/simd/nasm/jsimdext.inc @@ -135,6 +135,8 @@ section .note.GNU-stack noalloc noexec nowrite progbits %define POINTER qword ; general pointer type %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) %define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define resp resq +%define dp dq %define raxp rax %define rbxp rbx %define rcxp rcx @@ -157,6 +159,8 @@ section .note.GNU-stack noalloc noexec nowrite progbits %define POINTER dword ; general pointer type %define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) %define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define resp resd +%define dp dd ; x86_64 ILP32 ABI (x32) %define raxp eax %define rbxp ebx diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm index 267425e3..64ea8604 100644 --- a/simd/x86_64/jchuff-sse2.asm +++ b/simd/x86_64/jchuff-sse2.asm @@ -1,7 +1,7 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) ; -; Copyright (C) 2009-2011, 2014-2016, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. ; @@ -16,11 +16,25 @@ ; http://sourceforge.net/project/showfiles.php?group_id=6208 ; ; This file contains an SSE2 implementation for Huffman coding of one block. -; The following code is based directly on jchuff.c; see jchuff.c for more -; details. +; The following code is based on jchuff.c; see jchuff.c for more details. %include "jsimdext.inc" +struc working_state +.next_output_byte: resp 1 ; => next byte to write in buffer +.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer +.cur.put_buffer.simd resq 1 ; current bit accumulation buffer +.cur.free_bits resd 1 ; # of bits available in it +.cur.last_dc_val resd 4 ; last DC coef for each component +.cinfo: resp 1 ; dump_buffer needs access to this +endstruc + +struc c_derived_tbl +.ehufco: resd 256 ; code for each symbol +.ehufsi: resb 256 ; length of code for each symbol +; If no code has been allocated for a symbol S, ehufsi[S] contains 0 +endstruc + ; -------------------------------------------------------------------------- SECTION SEG_CONST @@ -29,134 +43,137 @@ EXTN(jconst_huff_encode_one_block): -%include "jpeg_nbits_table.inc" +jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007 + dd 0x000f, 0x001f, 0x003f, 0x007f + dd 0x00ff, 0x01ff, 0x03ff, 0x07ff + dd 0x0fff, 0x1fff, 0x3fff, 0x7fff alignz 32 +times 1 << 14 db 15 +times 1 << 13 db 14 +times 1 << 12 db 13 +times 1 << 11 db 12 +times 1 << 10 db 11 +times 1 << 9 db 10 +times 1 << 8 db 9 +times 1 << 7 db 8 +times 1 << 6 db 7 +times 1 << 5 db 6 +times 1 << 4 db 5 +times 1 << 3 db 4 +times 1 << 2 db 3 +times 1 << 1 db 2 +times 1 << 0 db 1 +times 1 db 0 +jpeg_nbits_table: +times 1 db 0 +times 1 << 0 db 1 +times 1 << 1 db 2 +times 1 << 2 db 3 +times 1 << 3 db 4 +times 1 << 4 db 5 +times 1 << 5 db 6 +times 1 << 6 db 7 +times 1 << 7 db 8 +times 1 << 8 db 9 +times 1 << 9 db 10 +times 1 << 10 db 11 +times 1 << 11 db 12 +times 1 << 12 db 13 +times 1 << 13 db 14 +times 1 << 14 db 15 + + alignz 32 + +%define NBITS(x) nbits_base + x +%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table) + ; -------------------------------------------------------------------------- SECTION SEG_TEXT BITS 64 -; These macros perform the same task as the emit_bits() function in the -; original libjpeg code. In addition to reducing overhead by explicitly -; inlining the code, additional performance is achieved by taking into -; account the size of the bit buffer and waiting until it is almost full -; before emptying it. This mostly benefits 64-bit platforms, since 6 -; bytes can be stored in a 64-bit bit buffer before it has to be emptied. +; Shorthand used to describe SIMD operations: +; wN: xmmN treated as eight signed 16-bit values +; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7 +; bN: xmmN treated as 16 unsigned 8-bit values +; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15 +; Contents of SIMD registers are shown in memory order. -%macro EMIT_BYTE 0 - sub put_bits, 8 ; put_bits -= 8; - mov rdx, put_buffer - mov ecx, put_bits - shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); - mov byte [buffer], dl ; *buffer++ = c; - add buffer, 1 - cmp dl, 0xFF ; need to stuff a zero byte? - jne %%.EMIT_BYTE_END - mov byte [buffer], 0 ; *buffer++ = 0; - add buffer, 1 -%%.EMIT_BYTE_END: -%endmacro +; Fill the bit buffer to capacity with the leading bits from code, then output +; the bit buffer and put the remaining bits from code into the bit buffer. +; +; Usage: +; code - contains the bits to shift into the bit buffer (LSB-aligned) +; %1 - the label to which to jump when the macro completes +; %2 (optional) - extra instructions to execute after nbits has been set +; +; Upon completion, free_bits will be set to the number of remaining bits from +; code, and put_buffer will contain those remaining bits. temp and code will +; be clobbered. +; +; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE() +; macro in jchuff.c. -%macro PUT_BITS 1 - add put_bits, ecx ; put_bits += size; - shl put_buffer, cl ; put_buffer = (put_buffer << size); - or put_buffer, %1 -%endmacro - -%macro CHECKBUF31 0 - cmp put_bits, 32 ; if (put_bits > 31) { - jl %%.CHECKBUF31_END - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE -%%.CHECKBUF31_END: -%endmacro - -%macro CHECKBUF47 0 - cmp put_bits, 48 ; if (put_bits > 47) { - jl %%.CHECKBUF47_END - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE -%%.CHECKBUF47_END: -%endmacro - -%macro EMIT_BITS 2 - CHECKBUF47 - mov ecx, %2 - PUT_BITS %1 -%endmacro - -%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) - pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); - pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); - pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); - pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); - pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; - pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; - pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; - pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; - pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; - pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; - pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; - pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; - pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; - pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; - pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; - pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; - pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; - pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; - pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; - pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; - pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; - pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; - pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; - pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; - pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; - pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; - pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; - pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; - pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; - pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; - pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; - pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; - pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; - pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; - pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; -%if %1 != 32 - pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; -%else - pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; -%endif - pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); - paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); - paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); - paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg); - paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); - pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); - pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); - pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); - pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); - pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); - pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); - pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); - pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); - movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); - movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); - movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); - movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); - movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); - movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); - movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); - movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%macro EMIT_QWORD 1-2 + add nbitsb, free_bitsb ; nbits += free_bits; + neg free_bitsb ; free_bits = -free_bits; + mov tempd, code ; temp = code; + shl put_buffer, nbitsb ; put_buffer <<= nbits; + mov nbitsb, free_bitsb ; nbits = free_bits; + neg free_bitsb ; free_bits = -free_bits; + shr tempd, nbitsb ; temp >>= nbits; + or tempq, put_buffer ; temp |= put_buffer; + movq xmm0, tempq ; xmm0.u64 = { temp, 0 }; + bswap tempq ; temp = htonl(temp); + mov put_buffer, codeq ; put_buffer = code; + pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0); + %2 + pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i); + mov qword [buffer], tempq ; memcpy(buffer, &temp, 8); + ; (speculative; will be overwritten if + ; code contains any 0xFF bytes) + add free_bitsb, 64 ; free_bits += 64; + add bufferp, 8 ; buffer += 8; + test code, code ; if (code == 0) /* No 0xFF bytes */ + jz %1 ; return; + ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8 + ; bytes in the qword. + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer-7], 0 ; buffer[-7] = 0; + sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr tempq, 16 ; temp >>= 16; + mov byte [buffer], tempb ; buffer[0] = temp[0]; + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr tempq, 16 ; temp >>= 16; + mov byte [buffer], tempb ; buffer[0] = temp[0]; + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr tempd, 16 ; temp >>= 16; + mov byte [buffer], tempb ; buffer[0] = temp[0]; + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + jmp %1 ; return; %endmacro ; @@ -167,181 +184,398 @@ EXTN(jconst_huff_encode_one_block): ; JCOEFPTR block, int last_dc_val, ; c_derived_tbl *dctbl, c_derived_tbl *actbl) ; +; NOTES: +; When shuffling data, we try to avoid pinsrw as much as possible, since it is +; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on +; modern CPUs, so chains of pinsrw instructions (even with different outputs) +; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and +; requires 2 µops (with memory operand) on Intel. In either case, only one +; pinsrw instruction can be decoded per cycle (and nothing else if they are +; back-to-back), so out-of-order execution cannot be used to work around long +; pinsrw chains (though for Sandy Bridge and later, this may be less of a +; problem if the code runs from the µop cache.) +; +; We use tzcnt instead of bsf without checking for support. The instruction is +; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to +; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is +; an input dependency (although the behavior is not formally defined, Intel +; CPUs usually leave the destination unmodified if the source is zero.) This +; can prevent out-of-order execution, so we clear the destination before +; invoking tzcnt. +; +; Initial register allocation +; rax - buffer +; rbx - temp +; rcx - nbits +; rdx - block --> free_bits +; rsi - nbits_base +; rdi - t +; rbp - code +; r8 - dctbl --> code_temp +; r9 - actbl +; r10 - state +; r11 - index +; r12 - put_buffer -; r10 = working_state *state -; r11 = JOCTET *buffer -; r12 = JCOEFPTR block -; r13d = int last_dc_val -; r14 = c_derived_tbl *dctbl -; r15 = c_derived_tbl *actbl +%define buffer rax +%ifdef WIN64 +%define bufferp rax +%else +%define bufferp raxp +%endif +%define tempq rbx +%define tempd ebx +%define tempb bl +%define temph bh +%define nbitsq rcx +%define nbits ecx +%define nbitsb cl +%define block rdx +%define nbits_base rsi +%define t rdi +%define td edi +%define codeq rbp +%define code ebp +%define dctbl r8 +%define actbl r9 +%define state r10 +%define index r11 +%define indexd r11d +%define put_buffer r12 +%define put_bufferd r12d -%define t1 rbp - (DCTSIZE2 * SIZEOF_WORD) -%define t2 t1 - (DCTSIZE2 * SIZEOF_WORD) -%define put_buffer r8 -%define put_bits r9d -%define buffer rax +; Step 1: Re-arrange input data according to jpeg_natural_order +; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 +; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05 +; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34 +; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28 +; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36 +; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51 +; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46 +; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63 align 32 GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): - push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [t2] - push_xmm 4 - collect_args 6 + +%ifdef WIN64 + +; rcx = working_state *state +; rdx = JOCTET *buffer +; r8 = JCOEFPTR block +; r9 = int last_dc_val +; [rax+48] = c_derived_tbl *dctbl +; [rax+56] = c_derived_tbl *actbl + + ;X: X = code stream + mov buffer, rdx + mov block, r8 + movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 push rbx + push rbp + movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 + push rsi + push rdi + push r12 + movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 + mov state, rcx + movsx code, word [block] ;Z: code = block[0]; + pxor xmm4, xmm4 ;A: w4[i] = 0; + sub code, r9d ;Z: code -= last_dc_val; + mov dctbl, POINTER [rsp+6*8+4*8] + mov actbl, POINTER [rsp+6*8+5*8] + punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 + lea nbits_base, [rel jpeg_nbits_table] + add rsp, -DCTSIZE2 * SIZEOF_WORD + mov t, rsp - mov buffer, r11 ; r11 is now sratch +%else - mov put_buffer, MMWORD [r10+SIZEOF_POINTER*2] ; put_buffer = state->cur.put_buffer; - mov put_bits, dword [r10+SIZEOF_POINTER*2+8] ; put_bits = state->cur.put_bits; - push r10 ; r10 is now scratch +; rdi = working_state *state +; rsi = JOCTET *buffer +; rdx = JCOEFPTR block +; rcx = int last_dc_val +; r8 = c_derived_tbl *dctbl +; r9 = c_derived_tbl *actbl - ; Encode the DC coefficient difference per section F.1.2.1 - movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; - sub edi, r13d ; r13 is not used anymore - mov ebx, edi + ;X: X = code stream + movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 + push rbx + push rbp + movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 + push r12 + mov state, rdi + mov buffer, rsi + movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 + movsx codeq, word [block] ;Z: code = block[0]; + lea nbits_base, [rel jpeg_nbits_table] + pxor xmm4, xmm4 ;A: w4[i] = 0; + sub codeq, rcx ;Z: code -= last_dc_val; + punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 + lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_ - ; This is a well-known technique for obtaining the absolute value - ; without a branch. It is derived from an assembly language technique - ; presented in "How to Optimize for the Pentium Processors", - ; Copyright (c) 1996, 1997 by Agner Fog. - mov esi, edi - sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - xor edi, esi ; temp ^= temp3; - sub edi, esi ; temp -= temp3; +%endif - ; For a negative input, want temp2 = bitwise complement of abs(input) - ; This code assumes we are on a two's complement machine - add ebx, esi ; temp2 += temp3; + pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 + pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 + punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 + punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13 + pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17 + ;A: (Row 0, offset 1) + pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0); + paddw xmm0, xmm4 ;A: w0[i] += w4[i]; + movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i]; - ; Find the number of bits needed for the magnitude of the coefficient - lea r11, [rel jpeg_nbits_table] - movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); - ; Emit the Huffman-coded symbol for the number of bits - mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; - movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; - EMIT_BITS r11, esi ; EMIT_BITS(code, size) + movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- -- + pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- -- + pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12 + movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55 + movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12 + punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51 + pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12 + pxor xmm4, xmm4 ;A: w4[i] = 0; + psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- -- + pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0); + pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12 + ; (Row 1, offset 1) + pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0); + paddw xmm1, xmm4 ;B: w1[i] += w4[i]; + movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i]; + pxor xmm4, xmm4 ;B: w4[i] = 0; + pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0); - ; Mask off any extra bits in code - mov esi, 1 - mov ecx, edi - shl esi, cl - dec esi - and ebx, esi ; temp2 &= (((JLONG)1)<= 0 ? 1 : 0); + pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51 + pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51 + movsxd codeq, code ;Z: sign extend code + pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27 + ; (Row 2, offset 1) + pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0); + paddw xmm2, xmm4 ;C: w2[i] += w4[i]; + movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i]; + pxor xmm4, xmm4 ;C: w4[i] = 0; + pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0); - pxor xmm8, xmm8 - pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); - pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); - pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); - pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); - pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero); - pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero); - pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero); - pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero); - packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); - packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); - packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5); - packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7); - pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; - pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; - pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32; - pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48; - shl r12, 16 - shl r14, 16 - or r11, r12 - or r13, r14 - shl r13, 32 - or r11, r13 - not r11 ; index = ~index; + packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i] + ; w/ signed saturation - ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11 - ;jmp .EFN + movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code); + movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55 + pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i); + pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i); + movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63 + punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63 + shl tempd, 16 ;Z: temp <<= 16; + psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 -- + pxor xmm2, xmm2 ;H: w2[i] = 0; + or put_bufferd, tempd ;Z: put_buffer |= temp; + pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 -- + movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- -- + unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59 + pxor xmm0, xmm0 ;H: w0[i] = 0; + pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 -- + ; (Row 7, offset 1) + pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0); + paddw xmm3, xmm2 ;H: w3[i] += w2[i]; + movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i]; + movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- -- + pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0); + punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47 + mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4] + ;Z: temp = dctbl->ehufco[nbits]; + movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47 + psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- + shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 + and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1; + pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 -- + pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58 + shl tempq, nbitsb ;Z: temp <<= nbits; + pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 -- + pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58 + pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 -- + or code, tempd ;Z: code |= temp; + movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58 + pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 -- + pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58 + pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53 + ; (Row 6, offset 1) + pxor xmm2, xmm2 ;G: w2[i] = 0; + pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0); + pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58 + paddw xmm4, xmm0 ;G: w4[i] += w0[i]; + movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i]; + pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58 + ; (Row 5, offset 1) + pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0); + pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59 - mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; - movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; - lea rsi, [t1] -.BLOOP: - bsf r12, r11 ; r = __builtin_ctzl(index); - jz .ELOOP - mov rcx, r12 - lea rsi, [rsi+r12*2] ; k += r; - shr r11, cl ; index >>= r; - movzx rdi, word [rsi] ; temp = t1[k]; - lea rbx, [rel jpeg_nbits_table] - movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); -.BRLOOP: - cmp r12, 16 ; while (r > 15) { - jl .ERLOOP - EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) - sub r12, 16 ; r -= 16; - jmp .BRLOOP -.ERLOOP: - ; Emit Huffman symbol for run length / number of bits - CHECKBUF31 ; uses rcx, rdx + packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i] + ; w/ signed saturation - shl r12, 4 ; temp3 = (r << 4) + nbits; - add r12, rdi - mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; - movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; - PUT_BITS rbx + pxor xmm0, xmm0 ;F: w0[i] = 0; + pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59 + pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0); + pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i); + pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59 + paddw xmm1, xmm2 ;F: w1[i] += w2[i]; + movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i]; + pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 + ; (Row 4, offset 1) +%undef block +%define free_bitsq rdx +%define free_bitsd edx +%define free_bitsb dl + pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); + shl tempq, 48 ;Z: temp <<= 48; + pxor xmm2, xmm2 ;E: w2[i] = 0; + pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0); + paddw xmm5, xmm0 ;E: w5[i] += w0[i]; + or tempq, put_buffer ;Z: temp |= put_buffer; + movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i]; + lea t, [dword t - 2] ;Z: t = &t[-1]; + pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0); - ;EMIT_CODE(code, size) + packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i] + ; w/ signed saturation - movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; - ; Mask off any extra bits in code - mov rcx, rdi - mov rdx, 1 - shl rdx, cl - dec rdx - and rbx, rdx ; temp2 &= (((JLONG)1)<ehufsi[nbits]; +%undef dctbl +%define code_temp r8d + pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i); + mov free_bitsd, [state+working_state.cur.free_bits] + ;Z: free_bits = state->cur.free_bits; + pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF; + shl index, 32 ;Z: index <<= 32; + mov put_buffer, [state+working_state.cur.put_buffer.simd] + ;Z: put_buffer = state->cur.put_buffer.simd; + or index, tempq ;Z: index |= temp; + not index ;Z: index = ~index; + sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0) + jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE; + align 16 +.EMIT_CODE: ;Z: .EMIT_CODE: + EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND - shr r11, 1 ; index >>= 1; - add rsi, 2 ; ++k; - jmp .BLOOP -.ELOOP: - ; If the last coef(s) were zero, emit an end-of-block code - lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; - cmp rdi, rsi ; if (r > 0) { - je .EFN - mov ebx, INT [r15] ; code = actbl->ehufco[0]; - movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; - EMIT_BITS rbx, r12d -.EFN: - pop r10 - ; Save put_buffer & put_bits - mov MMWORD [r10+SIZEOF_POINTER*2], put_buffer ; state->cur.put_buffer = put_buffer; - mov dword [r10+SIZEOF_POINTER*2+8], put_bits ; state->cur.put_bits = put_bits; +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pop rbx - uncollect_args 6 - pop_xmm 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + align 16 +.BRLOOP: ; do { + lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16; + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] + ; nbits = actbl->ehufsi[0xf0]; + mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4] + ; code = actbl->ehufco[0xf0]; + sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) + jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE; + shl put_buffer, nbitsb ; put_buffer <<= nbits; + mov nbits, code_temp ; nbits = code_temp; + or put_buffer, codeq ; put_buffer |= code; + cmp nbits, 16 ; if (nbits <= 16) + jle .ERLOOP ; break; + jmp .BRLOOP ; } while(1); + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 + times 5 nop +.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE: + shl put_buffer, nbitsb ; put_buffer <<= nbits; + or put_buffer, codeq ; put_buffer |= code; +.BLOOP_COND: ; .BLOOP_COND: + test index, index ; if (index != 0) + jz .ELOOP ; { +.BLOOP: ; do { + xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */ + tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index + inc nbits ; ++nbits; + lea t, [t + nbitsq * 2] ; t = &t[nbits]; + shr index, nbitsb ; index >>= nbits; +.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END: + cmp nbits, 16 ; if (nbits > 16) + jg .BRLOOP ; goto .BRLOOP; +.ERLOOP: ; .ERLOOP: + movsx codeq, word [t] ; code = *t; + lea tempd, [nbitsq * 2] ; temp = nbits * 2; + movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code); + lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits; + mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4] + ; code_temp = actbl->ehufco[temp-16]; + shl code_temp, nbitsb ; code_temp <<= nbits; + and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1; + add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)] + ; free_bits -= actbl->ehufsi[temp-16]; + or code, code_temp ; code |= code_temp; + sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) + jle .EMIT_CODE ; goto .EMIT_CODE; + shl put_buffer, nbitsb ; put_buffer <<= nbits; + or put_buffer, codeq ; put_buffer |= code; + test index, index + jnz .BLOOP ; } while (index != 0); +.ELOOP: ; } /* index != 0 */ + sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]); +%ifdef WIN64 + cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62) +%else + cmp td, -2 * SIZEOF_WORD ; if (t != -2) +%endif + je .EFN ; { + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] + ; nbits = actbl->ehufsi[0]; + mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0]; + sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) + jg .EFN_SKIP_EMIT_CODE ; { + EMIT_QWORD .EFN ; insert code, flush buffer + align 16 +.EFN_SKIP_EMIT_CODE: ; } else { + shl put_buffer, nbitsb ; put_buffer <<= nbits; + or put_buffer, codeq ; put_buffer |= code; +.EFN: ; } } + mov [state + working_state.cur.put_buffer.simd], put_buffer + ; state->cur.put_buffer.simd = put_buffer; + mov byte [state + working_state.cur.free_bits], free_bitsb + ; state->cur.free_bits = free_bits; +%ifdef WIN64 + sub rsp, -DCTSIZE2 * SIZEOF_WORD + pop r12 + pop rdi + pop rsi pop rbp + pop rbx +%else + pop r12 + pop rbp + pop rbx +%endif ret +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_BRLOOP_CODE: + EMIT_QWORD .EMIT_BRLOOP_CODE_END, {mov nbits, code_temp} ; insert code, flush buffer, + ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END + ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. align 32