Attempting to losslessly transform certain malformed JPEG images can
cause the nbits table index in the Huffman encoder to exceed 32768, so
we need to pad the SSE2 implementation of that table to 65536 entries as
we do with the C implementation.
Regression introduced by 087c29e07f
Fixes #543
584 lines
31 KiB
NASM
584 lines
31 KiB
NASM
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.

%include "jsimdext.inc"
|
|
|
|
; Field offsets of the working_state object that is passed as the first
; argument ("state") of jsimd_huff_encode_one_block_sse2.  The encoder reads
; and writes only cur.put_buffer.simd and cur.free_bits; the other fields are
; declared so that those two land at the right offsets.
struc working_state
.next_output_byte:     resp 1   ; => next byte to write in buffer
.free_in_buffer:       resp 1   ; # of byte spaces remaining in buffer
.cur.put_buffer.simd:  resq 1   ; current bit accumulation buffer
.cur.free_bits:        resd 1   ; # of bits available in it
.cur.last_dc_val:      resd 4   ; last DC coef for each component
.cinfo:                resp 1   ; dump_buffer needs access to this
endstruc

; Field offsets of a derived Huffman table (the dctbl / actbl arguments of
; jsimd_huff_encode_one_block_sse2).  Both arrays are indexed by symbol.
struc c_derived_tbl
.ehufco:  resd 256              ; code for each symbol
.ehufsi:  resb 256              ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15
jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
               dd 0x000f, 0x001f, 0x003f, 0x007f
               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff

    alignz      32

; jpeg_nbits_table is indexed with a SIGNED value.
;
;   * Offset k >= 0 holds the number of significant bits in k
;     (0 -> 0, 1 -> 1, 2..3 -> 2, ..., 32768..65535 -> 16).
;   * Offset -1 - k holds the same value, i.e. the table is mirrored around
;     the label.  The encoder subtracts 1 from negative values before the
;     lookup, so a negative coefficient -m is looked up at offset -1 - m and
;     yields the bit count of m.
;
; The non-negative half has 65536 entries because losslessly transforming
; certain malformed JPEG images can produce an index above 32767.  The
; negative half is padded to the same depth (offsets -65536..-1): a DC
; difference computed from two arbitrary 16-bit values can be as low as
; -65535, which becomes offset -65536 after the pre-decrement, and without the
; leading run of 16s that lookup would read in front of the table.
times 1 << 15 db 16
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 <<  9 db 10
times 1 <<  8 db  9
times 1 <<  7 db  8
times 1 <<  6 db  7
times 1 <<  5 db  6
times 1 <<  4 db  5
times 1 <<  3 db  4
times 1 <<  2 db  3
times 1 <<  1 db  2
times 1 <<  0 db  1
times 1       db  0
jpeg_nbits_table:
times 1       db  0
times 1 <<  0 db  1
times 1 <<  1 db  2
times 1 <<  2 db  3
times 1 <<  3 db  4
times 1 <<  4 db  5
times 1 <<  5 db  6
times 1 <<  6 db  7
times 1 <<  7 db  8
times 1 <<  8 db  9
times 1 <<  9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15
times 1 << 15 db 16

    alignz      32

; Both tables are addressed relative to a single base register (nbits_base,
; which holds the address of jpeg_nbits_table), so jpeg_mask_bits is reached
; through its assemble-time distance from jpeg_nbits_table.
%define NBITS(x)      nbits_base + x
%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; Shorthand used to describe SIMD operations:
; wN:     xmmN treated as eight signed 16-bit values
; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
; bN:     xmmN treated as 16 unsigned 8-bit values
; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
; Contents of SIMD registers are shown in memory order.

; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - the label to which to jump when the macro completes
; %2 (optional) - extra instructions to execute after nbits has been set
;
; Upon completion, free_bits will be set to the number of remaining bits from
; code, and put_buffer will contain those remaining bits.  temp and code will
; be clobbered.
;
; Additional requirements and side effects:
; - Every byte of xmm1 must be 0xFF on entry (it is the comparand used to
;   detect 0xFF bytes in the output qword).
; - xmm0 is clobbered.
; - nbits is overwritten by the macro; an instruction passed as %2 may reload
;   it, because nbits is not read again after the point where %2 is expanded.
; - The macro never falls through: it always leaves via a jump to %1.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.

%macro EMIT_QWORD 1-2
    add         nbitsb, free_bitsb      ; nbits += free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    mov         tempd, code             ; temp = code;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbitsb, free_bitsb      ; nbits = free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    shr         tempd, nbitsb           ; temp >>= nbits;
    or          tempq, put_buffer       ; temp |= put_buffer;
    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
    bswap       tempq                   ; temp = bswap64(temp);
                                        ; (first output byte is now temp[0])
    mov         put_buffer, codeq       ; put_buffer = code;
    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
    %2
    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
                                        ; (speculative; will be overwritten if
                                        ; code contains any 0xFF bytes)
    add         free_bitsb, 64          ; free_bits += 64;
    add         bufferp, 8              ; buffer += 8;
    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
    jz          %1                      ;   return;
    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    ; bytes in the qword.
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempd, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    jmp         %1                      ; return;
%endmacro

;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
;                                  JCOEFPTR block, int last_dc_val,
;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel.  In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support.  The instruction is
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
; an input dependency (although the behavior is not formally defined, Intel
; CPUs usually leave the destination unmodified if the source is zero.)  This
; can prevent out-of-order execution, so we clear the destination before
; invoking tzcnt.
;
; Initial register allocation
; rax - buffer
; rbx - temp
; rcx - nbits
; rdx - block --> free_bits
; rsi - nbits_base
; rdi - t
; rbp - code
; r8  - dctbl --> code_temp
; r9  - actbl
; r10 - state
; r11 - index
; r12 - put_buffer

; Symbolic register names used by EMIT_QWORD and by
; jsimd_huff_encode_one_block_sse2.  Suffixes q/d/b/h select the 64-bit,
; 32-bit, low-byte and second-byte (bh) views of the same register.
;
; "block" and "dctbl" are only live during the first part of the function; the
; function later %undef's them and reuses rdx as free_bits and r8d as
; code_temp.

%define buffer       rax
; bufferp is the view of buffer used for pointer arithmetic.
; NOTE(review): raxp is not defined in this file; presumably it comes from
; jsimdext.inc -- confirm there.
%ifdef WIN64
%define bufferp      rax
%else
%define bufferp      raxp
%endif
%define tempq        rbx
%define tempd        ebx
%define tempb        bl
%define temph        bh
%define nbitsq       rcx
%define nbits        ecx
%define nbitsb       cl
%define block        rdx
%define nbits_base   rsi
%define t            rdi
%define td           edi
%define codeq        rbp
%define code         ebp
%define dctbl        r8
%define actbl        r9
%define state        r10
%define index        r11
%define indexd       r11d
%define put_buffer   r12
%define put_bufferd  r12d

; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
;
; Overview of the code below:
; - The 63 AC coefficients are shuffled into the order shown above, negative
;   values are decremented by 1, and the result is stored in the 64-word
;   scratch array t (stack space on WIN64, the red zone otherwise).
; - In parallel, a 64-bit bitmap ("index") of the non-zero entries of t is
;   built, and the DC difference (instructions tagged ;Z:) is Huffman-coded.
; - The loop at .BLOOP then walks the set bits of index with tzcnt, emitting
;   one run/size code plus coefficient bits per non-zero coefficient.
; The letters A..H in the comments identify rows 0..7 of the reordered block.

    align       32
    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)

EXTN(jsimd_huff_encode_one_block_sse2):

%ifdef WIN64

    ; rcx = working_state *state
    ; rdx = JOCTET *buffer
    ; r8 = JCOEFPTR block
    ; r9 = int last_dc_val
    ; [rsp+40] = c_derived_tbl *dctbl   (offsets relative to rsp at entry)
    ; [rsp+48] = c_derived_tbl *actbl

                                                          ;X: X = code stream
    mov         buffer, rdx
    mov         block, r8
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
    push        rbx
    push        rbp
    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
    push        rsi
    push        rdi
    push        r12
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
    mov         state, rcx
    movsx       code, word [block]                        ;Z:     code = block[0];
    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    sub         code, r9d                                 ;Z:     code -= last_dc_val;
    ; 6*8 skips the five registers pushed above plus the return address;
    ; 4*8 and 5*8 then select the fifth and sixth argument slots.
    mov         dctbl, POINTER [rsp+6*8+4*8]
    mov         actbl, POINTER [rsp+6*8+5*8]
    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
    lea         nbits_base, [rel jpeg_nbits_table]
    add         rsp, -DCTSIZE2 * SIZEOF_WORD
    mov         t, rsp

%else

    ; rdi = working_state *state
    ; rsi = JOCTET *buffer
    ; rdx = JCOEFPTR block
    ; rcx = int last_dc_val
    ; r8 = c_derived_tbl *dctbl
    ; r9 = c_derived_tbl *actbl

                                                          ;X: X = code stream
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
    push        rbx
    push        rbp
    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
    push        r12
    mov         state, rdi
    mov         buffer, rsi
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
    movsx       codeq, word [block]                       ;Z:     code = block[0];
    lea         nbits_base, [rel jpeg_nbits_table]
    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ; use red zone for t_

%endif

    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
                                                          ;A:      (Row 0, offset 1)
    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];

    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
                                                          ;        (Row 1, offset 1)
    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);

    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
                                                          ;    w/ signed saturation

    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
                                                          ;        (Row 3, offset 1)
    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);

    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
                                                          ;Z:     i.e. if code is positive
    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
    movsxd      codeq, code                               ;Z:     sign extend code
    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
                                                          ;        (Row 2, offset 1)
    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);

    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
                                                          ;    w/ signed saturation

    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
    shl         tempd, 16                                 ;Z:     temp <<= 16;
    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
                                                          ;        (Row 7, offset 1)
    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
                                                          ;Z:     temp = dctbl->ehufco[nbits];
    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
    or          code, tempd                               ;Z:     code |= temp;
    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
                                                          ;        (Row 6, offset 1)
    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
                                                          ;        (Row 5, offset 1)
    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59

    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
                                                          ;    w/ signed saturation

    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
                                                          ;        (Row 4, offset 1)
    ; The last read through "block" is done; rdx is reused as free_bits.
%undef block
%define free_bitsq  rdx
%define free_bitsd  edx
%define free_bitsb  dl
    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
    shl         tempq, 48                                 ;Z:     temp <<= 48;
    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);

    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
                                                          ;    w/ signed saturation

    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
    ; dctbl is not needed past this point; r8d is reused as code_temp.
%undef dctbl
%define code_temp  r8d
    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
    mov         free_bitsd, [state+working_state.cur.free_bits]
                                                          ;Z:     free_bits = state->cur.free_bits;
    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
    shl         index, 32                                 ;Z:     index <<= 32;
    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
    or          index, tempq                              ;Z:     index |= temp;
    not         index                                     ;Z:     index = ~index;
    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
    align       16
.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.BRLOOP:                                ; do {
    lea         code_temp, [nbitsq - 16]  ; code_temp = nbits - 16;
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
                                        ; nbits = actbl->ehufsi[0xf0];
    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
                                        ; code = actbl->ehufco[0xf0];
    sub         free_bitsb, nbitsb      ; if ((free_bits -= nbits) <= 0)
    jle         .EMIT_BRLOOP_CODE       ;   goto .EMIT_BRLOOP_CODE;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbits, code_temp        ; nbits = code_temp;
    or          put_buffer, codeq       ; put_buffer |= code;
    cmp         nbits, 16               ; if (nbits <= 16)
    jle         .ERLOOP                 ;   break;
    jmp         .BRLOOP                 ; } while (1);

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
    times 5     nop
.ENTRY_SKIP_EMIT_CODE:                  ; .ENTRY_SKIP_EMIT_CODE:
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    or          put_buffer, codeq       ; put_buffer |= code;
.BLOOP_COND:                            ; .BLOOP_COND:
    test        index, index            ; if (index != 0)
    jz          .ELOOP                  ; {
.BLOOP:                                 ;   do {
    xor         nbits, nbits            ;     nbits = 0;  /* kill tzcnt input dependency */
    tzcnt       nbitsq, index           ;     nbits = # of trailing 0 bits in index
    inc         nbits                   ;     ++nbits;
    lea         t, [t + nbitsq * 2]     ;     t = &t[nbits];
    shr         index, nbitsb           ;     index >>= nbits;
.EMIT_BRLOOP_CODE_END:                  ; .EMIT_BRLOOP_CODE_END:
    cmp         nbits, 16               ;     if (nbits > 16)
    jg          .BRLOOP                 ;       goto .BRLOOP;
.ERLOOP:                                ; .ERLOOP:
    movsx       codeq, word [t]         ;     code = *t;
    lea         tempd, [nbitsq * 2]     ;     temp = nbits * 2;
    movzx       nbits, byte [NBITS(codeq)]
                                        ;     nbits = JPEG_NBITS(code);
    lea         tempd, [nbitsq + tempq * 8]
                                        ;     temp = temp * 8 + nbits;
    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
                                        ;     code_temp = actbl->ehufco[temp-16];
    shl         code_temp, nbitsb       ;     code_temp <<= nbits;
    and         code, dword [MASK_BITS(nbitsq)]
                                        ;     code &= (1 << nbits) - 1;
    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
                                        ;     nbits += actbl->ehufsi[temp-16];
    or          code, code_temp         ;     code |= code_temp;
    sub         free_bitsb, nbitsb      ;     if ((free_bits -= nbits) <= 0)
    jle         .EMIT_CODE              ;       goto .EMIT_CODE;
    shl         put_buffer, nbitsb      ;     put_buffer <<= nbits;
    or          put_buffer, codeq       ;     put_buffer |= code;
    test        index, index
    jnz         .BLOOP                  ;   } while (index != 0);
.ELOOP:                                 ; }  /* index != 0 */
    sub         td, esp                 ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
%ifdef WIN64
    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD  ; if (t != 62)
%else
    cmp         td, -2 * SIZEOF_WORD              ; if (t != -2)
%endif
    je          .EFN                    ; {
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
                                        ;   nbits = actbl->ehufsi[0];
    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ; code = actbl->ehufco[0];
    sub         free_bitsb, nbitsb      ;   if ((free_bits -= nbits) <= 0)
    jg          .EFN_SKIP_EMIT_CODE     ;   {
    EMIT_QWORD  .EFN                    ;     insert code, flush buffer
    align       16
.EFN_SKIP_EMIT_CODE:                    ;   } else {
    shl         put_buffer, nbitsb      ;     put_buffer <<= nbits;
    or          put_buffer, codeq       ;     put_buffer |= code;
.EFN:                                   ; } }
    mov         [state + working_state.cur.put_buffer.simd], put_buffer
                                        ; state->cur.put_buffer.simd = put_buffer;
    mov         byte [state + working_state.cur.free_bits], free_bitsb
                                        ; state->cur.free_bits = free_bits;
    ; The return value is buffer (rax), which EMIT_QWORD has advanced past
    ; the bytes it wrote.
%ifdef WIN64
    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
    pop         r12
    pop         rdi
    pop         rsi
    pop         rbp
    pop         rbx
%else
    pop         r12
    pop         rbp
    pop         rbx
%endif
    ret

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.EMIT_BRLOOP_CODE:
    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
                                        ; insert code, flush buffer,
                                        ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32