Reformat SSE/SSE2 SIMD code to improve readability

This commit is contained in:
DRC
2016-05-27 16:58:23 -05:00
parent 3ff13e651b
commit ff5685d534
43 changed files with 11067 additions and 11065 deletions

View File

@@ -33,454 +33,454 @@
; r13 = JDIMENSION output_row
; r14 = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
align 16
align 16
global EXTN(jsimd_rgb_ycc_convert_sse2)
global EXTN(jsimd_rgb_ycc_convert_sse2)
; RGB -> YCbCr conversion, x86-64 SSE2 variant: entry, frame setup and
; argument staging.
; NOTE(review): this listing is a rendered diff, so every instruction group
; below appears twice (original spacing, then reformatted spacing).  Each
; explanatory comment applies to both copies of the group that follows it.
EXTN(jsimd_rgb_ycc_convert_sse2):
; Frame setup: save rbp, carve out a SIZEOF_XMMWORD-aligned frame and reserve
; the wk[] scratch array below the new rbp.  [rbp] keeps the pre-alignment
; stack pointer so that .return can undo the alignment with "pop rsp".
; collect_args is a macro defined outside this view; presumably it loads the
; arguments into r10-r14 -- confirm against its definition.
; rbx is saved because it is used below as the second output row pointer.
push rbp
mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
push rbp
mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
; ecx = r10d, presumably img_width (the register note for r10 is above this
; view; the 32-bit variant loads img_width at this point).  A zero value
; means there is nothing to convert.
mov ecx, r10d
test rcx,rcx
jz near .return
mov ecx, r10d
test rcx, rcx
jz near .return
; Park the column count: rcx is reused for output_row below.
push rcx
push rcx
; rsi = r12, presumably output_buf: three per-component row arrays are read
; from it.  ecx = output_row (r13).  rdi/rbx/rdx end up pointing at entry
; "output_row" of component 0/1/2; the row loop later dereferences them as
; outptr0/outptr1/outptr2 and stores Y/Cb/Cr through them.
mov rsi, r12
mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
mov rsi, r12
mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
; rcx = column count again.
pop rcx
pop rcx
; rsi = r11, presumably input_buf (the row loop steps it under that name);
; eax = num_rows (r14).
; NOTE(review): "mov eax, r14d" zero-extends into rax, so "test rax, rax"
; can never set SF and the jle below is taken only when num_rows == 0.  A
; negative num_rows is therefore not rejected here, whereas the 32-bit
; variant tests eax.  Consider "test eax, eax" -- confirm intended contract.
mov rsi, r11
mov eax, r14d
test rax,rax
jle near .return
mov rsi, r11
mov eax, r14d
test rax, rax
jle near .return
; Per-row setup.  On entry rsi/rdi/rbx/rdx point into the input and the three
; output row arrays, rcx is the column count and rax the rows remaining.
.rowloop:
; Save the row-array cursors and the column count; they are popped again
; once the columns of this row have been processed.
push rdx
push rbx
push rdi
push rsi
push rcx ; col
push rdx
push rbx
push rdi
push rsi
push rcx ; col
; Dereference the cursors to get this row's pixel pointers.
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0
mov rbx, JSAMPROW [rbx] ; outptr1
mov rdx, JSAMPROW [rdx] ; outptr2
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0
mov rbx, JSAMPROW [rbx] ; outptr1
mov rdx, JSAMPROW [rdx] ; outptr2
; Unsigned compare: rows with at least SIZEOF_XMMWORD columns start in the
; full-width loop; narrower rows fall through into the partial-load path
; (.column_ld1) that follows.
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
; Partial-width load for 3-byte pixels: fewer than SIZEOF_XMMWORD columns
; remain.  rcx is turned into a byte count and its bits are tested from the
; smallest size upward; each set bit loads one piece of that size from
; [rsi+rcx] after rcx has been reduced by the piece size, so pieces are
; gathered from the end of the row toward its start.  Data loaded earlier
; (later in memory) is shifted up before each new piece is merged below it.
; The size names are defined outside this view; presumably
; SIZEOF_BYTE/WORD/DWORD/MMWORD/XMMWORD = 1/2/4/8/16 -- confirm.
.column_ld1:
; rax (rows remaining) and rdx (outptr2) are borrowed as scratch here and
; restored in .column_ld4.
; NOTE(review): rax is not cleared before use, so when the SIZEOF_BYTE bit
; is clear its old contents reach xmmA via the movd in .column_ld4; those
; lanes appear to lie past the valid pixels -- confirm.
push rax
push rdx
lea rcx,[rcx+rcx*2] ; rcx = col * 3 = bytes remaining (imul ecx,RGB_PIXELSIZE)
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx]
push rax
push rdx
lea rcx, [rcx+rcx*2] ; rcx = col * 3 = bytes remaining (imul ecx,RGB_PIXELSIZE)
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx]
.column_ld2:
; Optional word piece: the bits gathered so far move up by WORD_BIT and the
; new word is merged underneath.
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT
or rax,rdx
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT
or rax, rdx
.column_ld4:
; Move the gathered low dword into xmmA, give rdx/rax back to their owners,
; then handle the optional dword piece inside the vector register.
movd xmmA,eax
pop rdx
pop rax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF
movd xmmA, eax
pop rdx
pop rax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD
por xmmA, xmmF
.column_ld8:
; Optional quadword piece, merged the same way.
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmB
.column_ld16:
; One whole vector precedes the gathered tail: the tail becomes the second
; vector (xmmF) and the first vector is loaded from the start of the data.
; rcx is set to SIZEOF_XMMWORD so that the "sub rcx" after the conversion
; reaches zero and this is the last pass for the row.
; Since col < SIZEOF_XMMWORD here, bytes < 3*SIZEOF_XMMWORD, so this bit and
; the 2*SIZEOF_XMMWORD bit tested in .column_ld32 cannot both be set.
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF,xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv
.column_ld32:
; Two whole vectors precede the tail: the tail becomes the third vector
; (xmmB).  The mov between test and jz does not modify flags, so the branch
; still reflects the test.
test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmB,xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
.columnloop:
; Full-width pass: load three consecutive vectors of interleaved input with
; unaligned loads (the input pointer carries no alignment guarantee here).
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
movdqa xmmG, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
movdqa xmmD, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
movdqa xmmE, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH
pxor xmmH, xmmH
movdqa xmmC,xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmB, xmmE
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
movdqa xmmF, xmmD
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmF,xmmA
movdqa xmmH,xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmF, xmmA
movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
.columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmD, xmmA
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmC, xmmF
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmB, xmmA
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmG, xmmD
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmE, xmmA
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
movdqa xmmH, xmmB
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF
pxor xmmF, xmmF
movdqa xmmC,xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmD, xmmB
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
movdqa xmmG, xmmE
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH
punpckhbw xmmH,xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
punpcklbw xmmF, xmmH
punpckhbw xmmH, xmmH
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6,xmm1
punpcklwd xmm1,xmm3
punpckhwd xmm6,xmm3
movdqa xmm7,xmm1
movdqa xmm4,xmm6
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa xmm6, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm6, xmm3
movdqa xmm7, xmm1
movdqa xmm4, xmm6
pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1,xmm1
pxor xmm6,xmm6
punpcklwd xmm1,xmm5 ; xmm1=BOL
punpckhwd xmm6,xmm5 ; xmm6=BOH
psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
pxor xmm1, xmm1
pxor xmm6, xmm6
punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm1
paddd xmm4,xmm6
paddd xmm7,xmm5
paddd xmm4,xmm5
psrld xmm7,SCALEBITS ; xmm7=CbOL
psrld xmm4,SCALEBITS ; xmm4=CbOH
packssdw xmm7,xmm4 ; xmm7=CbO
paddd xmm7, xmm1
paddd xmm4, xmm6
paddd xmm7, xmm5
paddd xmm4, xmm5
psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6,xmm0
punpcklwd xmm0,xmm2
punpckhwd xmm6,xmm2
movdqa xmm5,xmm0
movdqa xmm4,xmm6
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa xmm6, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm6, xmm2
movdqa xmm5, xmm0
movdqa xmm4, xmm6
pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0,xmm0
pxor xmm6,xmm6
punpcklwd xmm0,xmm1 ; xmm0=BEL
punpckhwd xmm6,xmm1 ; xmm6=BEH
psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
pxor xmm0, xmm0
pxor xmm6, xmm6
punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5,xmm0
paddd xmm4,xmm6
paddd xmm5,xmm1
paddd xmm4,xmm1
psrld xmm5,SCALEBITS ; xmm5=CbEL
psrld xmm4,SCALEBITS ; xmm4=CbEH
packssdw xmm5,xmm4 ; xmm5=CbE
paddd xmm5, xmm0
paddd xmm4, xmm6
paddd xmm5, xmm1
paddd xmm4, xmm1
psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7,BYTE_BIT
por xmm5,xmm7 ; xmm5=Cb
movdqa XMMWORD [rbx], xmm5 ; Save Cb
psllw xmm7, BYTE_BIT
por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [rbx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4,xmm0
punpcklwd xmm0,xmm3
punpckhwd xmm4,xmm3
movdqa xmm7,xmm0
movdqa xmm5,xmm4
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm4, xmm0
punpcklwd xmm0, xmm3
punpckhwd xmm4, xmm3
movdqa xmm7, xmm0
movdqa xmm5, xmm4
pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)]
paddd xmm0,xmm3
paddd xmm4,xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO
paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)]
paddd xmm0, xmm3
paddd xmm4, xmm3
psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3,xmm3
pxor xmm4,xmm4
punpcklwd xmm3,xmm1 ; xmm3=ROL
punpckhwd xmm4,xmm1 ; xmm4=ROH
psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
pxor xmm3, xmm3
pxor xmm4, xmm4
punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm3
paddd xmm5,xmm4
paddd xmm7,xmm1
paddd xmm5,xmm1
psrld xmm7,SCALEBITS ; xmm7=CrOL
psrld xmm5,SCALEBITS ; xmm5=CrOH
packssdw xmm7,xmm5 ; xmm7=CrO
paddd xmm7, xmm3
paddd xmm5, xmm4
paddd xmm7, xmm1
paddd xmm5, xmm1
psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4,xmm6
punpcklwd xmm6,xmm2
punpckhwd xmm4,xmm2
movdqa xmm1,xmm6
movdqa xmm5,xmm4
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm4, xmm6
punpcklwd xmm6, xmm2
punpckhwd xmm4, xmm2
movdqa xmm1, xmm6
movdqa xmm5, xmm4
pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)]
paddd xmm6,xmm2
paddd xmm4,xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE
paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)]
paddd xmm6, xmm2
paddd xmm4, xmm2
psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT
por xmm6,xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y
psllw xmm0, BYTE_BIT
por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y
pxor xmm2,xmm2
pxor xmm4,xmm4
punpcklwd xmm2,xmm3 ; xmm2=REL
punpckhwd xmm4,xmm3 ; xmm4=REH
psrld xmm2,1 ; xmm2=REL*FIX(0.500)
psrld xmm4,1 ; xmm4=REH*FIX(0.500)
pxor xmm2, xmm2
pxor xmm4, xmm4
punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1,xmm2
paddd xmm5,xmm4
paddd xmm1,xmm0
paddd xmm5,xmm0
psrld xmm1,SCALEBITS ; xmm1=CrEL
psrld xmm5,SCALEBITS ; xmm5=CrEH
packssdw xmm1,xmm5 ; xmm1=CrE
paddd xmm1, xmm2
paddd xmm5, xmm4
paddd xmm1, xmm0
paddd xmm5, xmm0
psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7,BYTE_BIT
por xmm1,xmm7 ; xmm1=Cr
movdqa XMMWORD [rdx], xmm1 ; Save Cr
psllw xmm7, BYTE_BIT
por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [rdx], xmm1 ; Save Cr
; End of one conversion pass: SIZEOF_XMMWORD columns have been written to
; each output row.  Advance the pointers and decide how to continue.
; The partial-load paths set rcx = SIZEOF_XMMWORD beforehand, so after them
; the subtraction leaves zero and the row is finished.
sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0
add rbx, byte SIZEOF_XMMWORD ; outptr1
add rdx, byte SIZEOF_XMMWORD ; outptr2
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop ; at least one more full-width pass
test rcx,rcx
jnz near .column_ld1 ; 1..SIZEOF_XMMWORD-1 columns left: partial load
sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0
add rbx, byte SIZEOF_XMMWORD ; outptr1
add rdx, byte SIZEOF_XMMWORD ; outptr2
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop ; at least one more full-width pass
test rcx, rcx
jnz near .column_ld1 ; 1..SIZEOF_XMMWORD-1 columns left: partial load
; Row done: restore the column count and the row-array cursors saved at the
; top of .rowloop (reverse push order).
pop rcx ; col
pop rsi
pop rdi
pop rbx
pop rdx
pop rcx ; col
pop rsi
pop rdi
pop rbx
pop rdx
; Step every cursor to the next row pointer and loop while rows remain.
add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW
add rbx, byte SIZEOF_JSAMPROW
add rdx, byte SIZEOF_JSAMPROW
dec rax ; num_rows
jg near .rowloop
add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW
add rbx, byte SIZEOF_JSAMPROW
add rdx, byte SIZEOF_JSAMPROW
dec rax ; num_rows
jg near .rowloop
; Common exit: undo the prologue in reverse order.
; uncollect_args is a macro defined outside this view; presumably the
; counterpart of collect_args -- confirm.
.return:
pop rbx
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- pre-alignment stack pointer stored at [rbp] by the prologue
pop rbp ; restore the caller's rbp
ret
pop rbx
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- pre-alignment stack pointer stored at [rbp] by the prologue
pop rbp ; restore the caller's rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -25,479 +25,479 @@
; JDIMENSION output_row, int num_rows);
;
%define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows
%define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16
align 16
global EXTN(jsimd_rgb_ycc_convert_sse2)
global EXTN(jsimd_rgb_ycc_convert_sse2)
EXTN(jsimd_rgb_ycc_convert_sse2):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make a room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make a room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx
jz near .return
mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx
jz near .return
push ecx
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax,eax
jle near .return
alignx 16,7
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edx
push ebx
push edi
push esi
push ecx ; col
pushpic eax
push edx
push ebx
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax)
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16,7
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx]
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT
or eax,edx
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
movd xmmA,eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF
movd xmmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF,xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv
.column_ld32:
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmB,xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
alignx 16,7
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
movdqa xmmG, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
movdqa xmmD, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
movdqa xmmE, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH
pxor xmmH, xmmH
movdqa xmmC,xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmB, xmmE
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
movdqa xmmF, xmmD
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmF,xmmA
movdqa xmmH,xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
alignx 16,7
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmF, xmmA
movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmD, xmmA
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmC, xmmF
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmB, xmmA
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmG, xmmD
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmE, xmmA
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
movdqa xmmH, xmmB
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF
pxor xmmF, xmmF
movdqa xmmC,xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmD, xmmB
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
movdqa xmmG, xmmE
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH
punpckhbw xmmH,xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
punpcklbw xmmF, xmmH
punpckhbw xmmH, xmmH
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6,xmm1
punpcklwd xmm1,xmm3
punpckhwd xmm6,xmm3
movdqa xmm7,xmm1
movdqa xmm4,xmm6
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa xmm6, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm6, xmm3
movdqa xmm7, xmm1
movdqa xmm4, xmm6
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1,xmm1
pxor xmm6,xmm6
punpcklwd xmm1,xmm5 ; xmm1=BOL
punpckhwd xmm6,xmm5 ; xmm6=BOH
psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
pxor xmm1, xmm1
pxor xmm6, xmm6
punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm1
paddd xmm4,xmm6
paddd xmm7,xmm5
paddd xmm4,xmm5
psrld xmm7,SCALEBITS ; xmm7=CbOL
psrld xmm4,SCALEBITS ; xmm4=CbOH
packssdw xmm7,xmm4 ; xmm7=CbO
paddd xmm7, xmm1
paddd xmm4, xmm6
paddd xmm7, xmm5
paddd xmm4, xmm5
psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6,xmm0
punpcklwd xmm0,xmm2
punpckhwd xmm6,xmm2
movdqa xmm5,xmm0
movdqa xmm4,xmm6
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa xmm6, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm6, xmm2
movdqa xmm5, xmm0
movdqa xmm4, xmm6
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0,xmm0
pxor xmm6,xmm6
punpcklwd xmm0,xmm1 ; xmm0=BEL
punpckhwd xmm6,xmm1 ; xmm6=BEH
psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
pxor xmm0, xmm0
pxor xmm6, xmm6
punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5,xmm0
paddd xmm4,xmm6
paddd xmm5,xmm1
paddd xmm4,xmm1
psrld xmm5,SCALEBITS ; xmm5=CbEL
psrld xmm4,SCALEBITS ; xmm4=CbEH
packssdw xmm5,xmm4 ; xmm5=CbE
paddd xmm5, xmm0
paddd xmm4, xmm6
paddd xmm5, xmm1
paddd xmm4, xmm1
psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7,BYTE_BIT
por xmm5,xmm7 ; xmm5=Cb
movdqa XMMWORD [ebx], xmm5 ; Save Cb
psllw xmm7, BYTE_BIT
por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [ebx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4,xmm0
punpcklwd xmm0,xmm3
punpckhwd xmm4,xmm3
movdqa xmm7,xmm0
movdqa xmm5,xmm4
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm4, xmm0
punpcklwd xmm0, xmm3
punpckhwd xmm4, xmm3
movdqa xmm7, xmm0
movdqa xmm5, xmm4
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)]
paddd xmm0,xmm3
paddd xmm4,xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO
paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)]
paddd xmm0, xmm3
paddd xmm4, xmm3
psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3,xmm3
pxor xmm4,xmm4
punpcklwd xmm3,xmm1 ; xmm3=ROL
punpckhwd xmm4,xmm1 ; xmm4=ROH
psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
pxor xmm3, xmm3
pxor xmm4, xmm4
punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm3
paddd xmm5,xmm4
paddd xmm7,xmm1
paddd xmm5,xmm1
psrld xmm7,SCALEBITS ; xmm7=CrOL
psrld xmm5,SCALEBITS ; xmm5=CrOH
packssdw xmm7,xmm5 ; xmm7=CrO
paddd xmm7, xmm3
paddd xmm5, xmm4
paddd xmm7, xmm1
paddd xmm5, xmm1
psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4,xmm6
punpcklwd xmm6,xmm2
punpckhwd xmm4,xmm2
movdqa xmm1,xmm6
movdqa xmm5,xmm4
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm4, xmm6
punpcklwd xmm6, xmm2
punpckhwd xmm4, xmm2
movdqa xmm1, xmm6
movdqa xmm5, xmm4
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)]
paddd xmm6,xmm2
paddd xmm4,xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE
paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)]
paddd xmm6, xmm2
paddd xmm4, xmm2
psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT
por xmm6,xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
psllw xmm0, BYTE_BIT
por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
pxor xmm2,xmm2
pxor xmm4,xmm4
punpcklwd xmm2,xmm3 ; xmm2=REL
punpckhwd xmm4,xmm3 ; xmm4=REH
psrld xmm2,1 ; xmm2=REL*FIX(0.500)
psrld xmm4,1 ; xmm4=REH*FIX(0.500)
pxor xmm2, xmm2
pxor xmm4, xmm4
punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1,xmm2
paddd xmm5,xmm4
paddd xmm1,xmm0
paddd xmm5,xmm0
psrld xmm1,SCALEBITS ; xmm1=CrEL
psrld xmm5,SCALEBITS ; xmm5=CrEH
packssdw xmm1,xmm5 ; xmm1=CrE
paddd xmm1, xmm2
paddd xmm5, xmm4
paddd xmm1, xmm0
paddd xmm5, xmm0
psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7,BYTE_BIT
por xmm1,xmm7 ; xmm1=Cr
movdqa XMMWORD [edx], xmm1 ; Save Cr
psllw xmm7, BYTE_BIT
por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [edx], xmm1 ; Save Cr
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
add ebx, byte SIZEOF_XMMWORD ; outptr1
add edx, byte SIZEOF_XMMWORD ; outptr2
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx,ecx
jnz near .column_ld1
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
add ebx, byte SIZEOF_XMMWORD ; outptr1
add edx, byte SIZEOF_XMMWORD ; outptr2
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
pop ebx
pop edx
poppic eax
pop ecx ; col
pop esi
pop edi
pop ebx
pop edx
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -19,23 +19,23 @@
; --------------------------------------------------------------------------
; Fixed-point RGB->YCbCr coefficients.  Every F_x_xxx value is the real
; constant shown in its FIX() comment multiplied by 2^SCALEBITS (65536) and
; rounded to the nearest integer.
; F_0_587 (38470) is larger than 32767, so it does not fit in a signed 16-bit
; word; the G weight is instead expressed as F_0_337 + F_0_250.
; NOTE(review): the signed-word limit presumably comes from the pmaddwd
; multiplies in the included conversion routine -- confirm there.
%define SCALEBITS 16
%define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
; Constant table for the RGB->YCbCr converter, exported under the global
; label jconst_rgb_ycc_convert_sse2.
; NOTE(review): alignz is a project macro defined outside this file; it
; presumably pads to a 16-byte boundary -- confirm in the shared include.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_ycc_convert_sse2)
alignz 16
global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
@@ -46,11 +46,11 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
; PD_ONEHALFM1_CJ: four dwords holding (one half in fixed point, minus 1) plus
; CENTERJSAMPLE pre-shifted left by SCALEBITS.
; PD_ONEHALF: four dwords holding one half in fixed point, 1 << (SCALEBITS-1).
; NOTE(review): both are presumably added just before a right shift by
; SCALEBITS in the included routine (rounding, and for PD_ONEHALFM1_CJ also the
; chroma centre offset) -- confirm at the use sites.
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Switch to the code section, select 64-bit encoding, and pull the converter
; body in from the include file named below.
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
%include "jccolext-sse2-64.asm"

View File

@@ -19,23 +19,23 @@
; --------------------------------------------------------------------------
; Fixed-point RGB->YCbCr coefficients: each F_x_xxx is the constant in its
; FIX() comment scaled by 2^SCALEBITS (65536) and rounded to an integer.
; F_0_587 (38470) exceeds 32767, the largest signed 16-bit value, so the
; G weight is split into F_0_337 + F_0_250, both of which fit in a signed word.
; NOTE(review): presumably required because the included routine multiplies
; these as signed words with pmaddwd -- confirm at the use sites.
%define SCALEBITS 16
%define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
; Constant table used by the RGB->YCbCr converter; the table is reachable
; through the exported label jconst_rgb_ycc_convert_sse2.
; NOTE(review): alignz comes from a shared include not shown here and
; presumably aligns to 16 bytes -- confirm.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_ycc_convert_sse2)
alignz 16
global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
@@ -46,11 +46,11 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
; PD_ONEHALFM1_CJ = 4 x dword: (2^(SCALEBITS-1) - 1) + (CENTERJSAMPLE << SCALEBITS).
; PD_ONEHALF      = 4 x dword: 2^(SCALEBITS-1), i.e. one half in fixed point.
; NOTE(review): presumably the bias terms added before the final
; "psrld ..., SCALEBITS" of the Cb/Cr and Y sums respectively -- confirm in
; the included routine.
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Switch to the code section, select 32-bit encoding, and pull the converter
; body in from the include file named below.
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
%include "jccolext-sse2.asm"

View File

@@ -19,31 +19,31 @@
; --------------------------------------------------------------------------
; Fixed-point luminance coefficients for RGB->grayscale.  Each F_x_xxx is the
; constant in its FIX() comment scaled by 2^SCALEBITS (65536) and rounded.
; F_0_587 (38470) does not fit in a signed 16-bit word, so the G weight is
; carried as two word-sized parts, F_0_337 and F_0_250, whose sum is F_0_587.
%define SCALEBITS 16
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
; Constant table for the RGB->grayscale converter, exported under the global
; label jconst_rgb_gray_convert_sse2.
;   PW_F0299_F0337 : the word pair (F_0_299, F_0_337) repeated 4 times (8 words)
;   PW_F0114_F0250 : the word pair (F_0_114, F_0_250) repeated 4 times (8 words)
;   PD_ONEHALF     : 4 dwords of 1 << (SCALEBITS-1), one half in fixed point
; NOTE(review): the pair layout presumably matches interleaved (R,G) and (B,G)
; words fed to pmaddwd in the included routine, with PD_ONEHALF as the rounding
; bias before the shift by SCALEBITS -- confirm at the use sites.
; NOTE(review): alignz is a project macro defined elsewhere; presumably it pads
; to a 16-byte boundary.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_gray_convert_sse2)
alignz 16
global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Switch to the code section, select 64-bit encoding, and pull the grayscale
; converter body in from the include file named below.
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
%include "jcgryext-sse2-64.asm"

View File

@@ -19,31 +19,31 @@
; --------------------------------------------------------------------------
; Fixed-point luminance weights: each F_x_xxx equals the constant in its FIX()
; comment times 2^SCALEBITS (65536), rounded to the nearest integer.
; Because F_0_587 (38470) is above the signed 16-bit maximum of 32767, the G
; weight is represented by F_0_337 and F_0_250, which add up to F_0_587.
%define SCALEBITS 16
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
; Constants consumed by the RGB->grayscale converter, exported as
; jconst_rgb_gray_convert_sse2.  Each PW_ row is a pair of 16-bit coefficients
; repeated four times; PD_ONEHALF is four dwords of 1 << (SCALEBITS-1).
; NOTE(review): the rows are presumably 16-byte operands for pmaddwd / paddd
; in the included routine, and alignz (a macro defined elsewhere) presumably
; supplies the 16-byte alignment -- confirm both.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_gray_convert_sse2)
alignz 16
global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Switch to the code section, select 32-bit encoding, and pull the grayscale
; converter body in from the include file named below.
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
%include "jcgryext-sse2.asm"

View File

@@ -33,333 +33,333 @@
; r13 = JDIMENSION output_row
; r14 = int num_rows
; wk(i) expands to the address expression of slot i in a WK_NUM-entry array of
; xmmwords located immediately below rbp: wk(0) is the lowest slot, at
; rbp - WK_NUM*SIZEOF_XMMWORD, and wk(WK_NUM) would equal rbp itself.
; WK_NUM is the number of scratch slots (2 here).
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
align 16
global EXTN(jsimd_rgb_gray_convert_sse2)
global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
mov ecx, r10d
test rcx,rcx
jz near .return
mov ecx, r10d
test rcx, rcx
jz near .return
push rcx
push rcx
mov rsi, r12
mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
mov rsi, r12
mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
pop rcx
pop rcx
mov rsi, r11
mov eax, r14d
test rax,rax
jle near .return
mov rsi, r11
mov eax, r14d
test rax, rax
jle near .return
.rowloop:
push rdi
push rsi
push rcx ; col
push rdi
push rsi
push rcx ; col
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push rax
push rdx
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx]
push rax
push rdx
lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT
or rax,rdx
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT
or rax, rdx
.column_ld4:
movd xmmA,eax
pop rdx
pop rax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF
movd xmmA, eax
pop rdx
pop rax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD
por xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF,xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
.column_ld32:
test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB,xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
.columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
movdqa xmmG, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
movdqa xmmD, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
movdqa xmmE, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH
pxor xmmH, xmmH
movdqa xmmC,xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmB, xmmE
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
movdqa xmmF, xmmD
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF,xmmA
movdqa xmmH,xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF, xmmA
movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
.columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmD, xmmA
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmC, xmmF
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmB, xmmA
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmG, xmmD
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmE, xmmA
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
movdqa xmmH, xmmB
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF
pxor xmmF, xmmF
movdqa xmmC,xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmD, xmmB
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
movdqa xmmG, xmmE
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH
punpckhbw xmmH,xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
punpcklbw xmmF, xmmH
punpckhbw xmmH, xmmH
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1
punpcklwd xmm1,xmm3
punpckhwd xmm6,xmm3
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm6, xmm3
pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0
punpcklwd xmm0,xmm2
punpckhwd xmm6,xmm2
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm6, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm6, xmm2
pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0
punpcklwd xmm0,xmm3
punpckhwd xmm4,xmm3
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm4, xmm0
punpcklwd xmm0, xmm3
punpckhwd xmm4, xmm3
pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0,xmm3
paddd xmm4,xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0, xmm3
paddd xmm4, xmm3
psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0, xmm4 ; xmm0=YO
movdqa xmm4,xmm6
punpcklwd xmm6,xmm2
punpckhwd xmm4,xmm2
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm4, xmm6
punpcklwd xmm6, xmm2
punpckhwd xmm4, xmm2
pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2
paddd xmm4,xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6, xmm2
paddd xmm4, xmm2
psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT
por xmm6,xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y
psllw xmm0, BYTE_BIT
por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y
sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
test rcx,rcx
jnz near .column_ld1
sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
test rcx, rcx
jnz near .column_ld1
pop rcx ; col
pop rsi
pop rdi
pop rcx ; col
pop rsi
pop rdi
add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW
dec rax ; num_rows
jg near .rowloop
add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW
dec rax ; num_rows
jg near .rowloop
.return:
pop rbx
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
pop rbx
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -27,358 +27,358 @@
; JDIMENSION output_row, int num_rows);
;
; Stack-argument accessors: each expands to an offset from a caller-supplied
; base register b (the function below passes eax, which it sets to esp right
; after "push ebp").
%define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows
%define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows
; Frame locals, all addressed from the aligned ebp: wk(i) is the i-th of
; WK_NUM xmmword scratch slots placed directly below ebp, and gotptr is one
; pointer-sized slot directly below wk(0).
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
; jsimd_rgb_gray_convert_sse2 (32-bit x86 variant)
;
; Reads img_width, input_buf, output_buf, output_row and num_rows from the
; stack through eax, and for each of num_rows rows writes one Y (luma) byte
; per input pixel into component 0 of output_buf, starting at output_row.
; Pixels are processed SIZEOF_XMMWORD at a time; a narrower remainder is
; gathered by the .column_ld* partial-load path and converted the same way.
; ebx, esi and edi are saved and restored; ecx and edx are not preserved.
align 16
align 16
global EXTN(jsimd_rgb_gray_convert_sse2)
global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
; Prologue: build a frame whose ebp is aligned to SIZEOF_XMMWORD, store the
; unaligned frame address at [ebp], and reserve the wk[] scratch slots below
; it. eax keeps the unaligned address and is the base for argument reads.
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
; ecx = img_width; a zero-width image has nothing to convert.
mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx
jz near .return
mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx
jz near .return
; edi = &output_buf[0][output_row]. ecx is borrowed for output_row, so the
; image width is saved on the stack around this computation.
push ecx
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx
pop ecx
; esi = input_buf, eax = num_rows (row counter from here on; the argument
; base is no longer needed). Leave if num_rows <= 0 (signed test).
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax,eax
jle near .return
alignx 16,7
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
; Per-row setup: save the row counter, both row-pointer cursors and the
; column count, then dereference the cursors to get this row's input and
; output pointers. eax is reloaded with the GOT address for constant loads.
pushpic eax
push edi
push esi
push ecx ; col
pushpic eax
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
; Rows with at least SIZEOF_XMMWORD pixels start in the full-width loop;
; narrower rows fall through into the partial-load path.
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16,7
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
; Partial load, 3 bytes per pixel: ecx is turned into the remaining byte
; count (3 * pixels). Its bits are tested one size at a time (BYTE, WORD,
; DWORD, MMWORD, XMMWORD, 2*XMMWORD); each set bit loads a piece of that size
; from the current end of the row, working back toward the row start.
; eax (GOT address) and edx are borrowed as scratch and restored at
; .column_ld4.
.column_ld1:
push eax
push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx]
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT
or eax,edx
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
; Move the bytes gathered in eax into xmmA, then keep prepending larger
; pieces by shifting xmmA left and OR-ing the new piece into the low end.
movd xmmA,eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF
movd xmmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmB
.column_ld16:
; From here whole xmmwords are loaded from the row start. ecx is reset to
; SIZEOF_XMMWORD so that the bookkeeping after the store ends the row.
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF,xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
.column_ld32:
; mov does not modify flags, so the jz below still acts on the test result.
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB,xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16,7
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
; Full-width load: three unaligned xmmwords hold SIZEOF_XMMWORD packed
; 3-byte pixels.
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; De-interleave. In the lane maps below each entry is <channel><column>:
; the first digit is the byte position within a pixel, the second the pixel
; column. Three rounds of shift + unpack separate the channels into even and
; odd columns; the final unpacks against zero widen the bytes to words.
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
movdqa xmmG, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
movdqa xmmD, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
movdqa xmmE, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
; xmmH = 0, used as the high byte when widening each channel to words.
pxor xmmH,xmmH
pxor xmmH, xmmH
movdqa xmmC,xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmB, xmmE
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
movdqa xmmF, xmmD
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
; Partial load, 4 bytes per pixel: ecx stays a pixel count and is scaled by
; RGB_PIXELSIZE in the addressing. Its bits are tested as SIZEOF_XMMWORD/16,
; /8, /4 and /2 pixels, loading a dword, a qword, one xmmword and two
; xmmwords respectively, again from the end of the row backwards.
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
; ecx is reset to SIZEOF_XMMWORD so the bookkeeping after the store ends the
; row; mov does not modify flags, so the jz still acts on the test result.
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF,xmmA
movdqa xmmH,xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16,7
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF, xmmA
movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
; Full-width load: four unaligned xmmwords hold SIZEOF_XMMWORD packed
; 4-byte pixels.
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; De-interleave. Lane-map entries are <channel><column> as in the 3-byte
; path. Byte and word unpacks transpose the four registers into per-channel
; even/odd column vectors, which are then widened to words.
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmD, xmmA
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmC, xmmF
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmB, xmmA
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmG, xmmD
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmE, xmmA
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
movdqa xmmH, xmmB
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
; xmmF = 0, used as the high byte when widening to words.
pxor xmmF,xmmF
pxor xmmF, xmmF
movdqa xmmC,xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmC, xmmA
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmD, xmmB
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
movdqa xmmG, xmmE
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
; Last pair: the zero register itself becomes a destination, so the source
; bytes are unpacked into the HIGH byte of each word and then shifted right
; by BYTE_BIT, which leaves them zero-extended.
punpcklbw xmmF,xmmH
punpckhbw xmmH,xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
punpcklbw xmmF, xmmH
punpckhbw xmmH, xmmH
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Odd columns, R/G term: interleave R and G words so each pmaddwd lane
; produces one 32-bit R*FIX(0.299) + G*FIX(0.337) sum.
movdqa xmm6,xmm1
punpcklwd xmm1,xmm3
punpckhwd xmm6,xmm3
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm6, xmm3
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
; Even columns, R/G term: same computation on RE/GE.
movdqa xmm6,xmm0
punpcklwd xmm0,xmm2
punpckhwd xmm6,xmm2
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm6, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm6, xmm2
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
; Spill the even-column partial sums to the wk[] scratch slots; xmm0 and
; xmm6 are reused for the B inputs right below.
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
; Odd columns, B/G term, then finish YO: add the R/G partial sums and the
; PD_ONEHALF rounding constant, shift right by SCALEBITS, pack to words.
movdqa xmm4,xmm0
punpcklwd xmm0,xmm3
punpckhwd xmm4,xmm3
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm4, xmm0
punpcklwd xmm0, xmm3
punpckhwd xmm4, xmm3
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0,xmm3
paddd xmm4,xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0, xmm3
paddd xmm4, xmm3
psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0, xmm4 ; xmm0=YO
; Even columns, B/G term, then finish YE the same way using the partial
; sums spilled to wk(0)/wk(1).
movdqa xmm4,xmm6
punpcklwd xmm6,xmm2
punpckhwd xmm4,xmm2
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm4, xmm6
punpcklwd xmm6, xmm2
punpckhwd xmm4, xmm2
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2
paddd xmm4,xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6, xmm2
paddd xmm4, xmm2
psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6, xmm4 ; xmm6=YE
; Re-interleave: odd-column Y moves to the high byte of each word and is
; OR-ed with even-column Y in the low byte. The store is an aligned
; movdqa, so outptr0 is expected to be SIZEOF_XMMWORD-aligned.
psllw xmm0,BYTE_BIT
por xmm6,xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
psllw xmm0, BYTE_BIT
por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
; Advance by one full vector. Keep looping while at least SIZEOF_XMMWORD
; pixels remain; a non-zero remainder goes through the partial-load path.
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx,ecx
jnz near .column_ld1
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
; End of row: restore the column count, the row-pointer cursors and the
; row counter saved at .rowloop.
pop ecx ; col
pop esi
pop edi
poppic eax
pop ecx ; col
pop esi
pop edi
poppic eax
; Step both cursors to the next row pointer and loop while rows remain.
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
; Epilogue: restore the saved registers, then unwind the aligned frame via
; the unaligned stack address stored at [ebp] by the prologue.
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -23,20 +23,20 @@
%include "jsimdext.inc"
; --------------------------------------------------------------------------
; Constant data section. The exported label marks the start of the data
; pulled in from jpeg_nbits_table.inc; alignz pads to a 16-byte boundary
; before and after it.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_huff_encode_one_block)
alignz 16
global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Code section; everything that follows is assembled as 64-bit code.
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly
@@ -46,118 +46,118 @@ EXTN(jconst_huff_encode_one_block):
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
; EMIT_BYTE (no arguments)
; Removes 8 bits from the bit count, shifts put_buffer right by the new
; put_bits, and stores the resulting low byte at [buffer], advancing buffer
; by one.  If the stored byte is 0xFF, a zero byte is stored after it and
; buffer is advanced again.
; Clobbers: rdx, ecx (shift count in cl), flags.  Updates put_bits and buffer.
; NOTE(review): the instruction sequence below is listed twice (it looks like
; the pre- and post-reformat copies of the same lines); as written it would
; be assembled twice per expansion -- confirm that only one copy is intended.
%macro EMIT_BYTE 0
sub put_bits, 8 ; put_bits -= 8;
mov rdx, put_buffer
mov ecx, put_bits
shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [buffer], dl ; *buffer++ = c;
add buffer, 1
cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END
mov byte [buffer], 0 ; *buffer++ = 0;
add buffer, 1
sub put_bits, 8 ; put_bits -= 8;
mov rdx, put_buffer
mov ecx, put_bits
shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [buffer], dl ; *buffer++ = c;
add buffer, 1
cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END
mov byte [buffer], 0 ; *buffer++ = 0;
add buffer, 1
%%.EMIT_BYTE_END:
%endmacro
; PUT_BITS code
; Appends bits to the bit buffer: adds ecx to put_bits, shifts put_buffer
; left by cl, and ORs %1 into put_buffer.
; In:  %1 = value to OR in; ecx (cl) = number of bits, set by the caller.
; Clobbers: flags.  Does not write any bytes to the output buffer.
; NOTE(review): the three instructions below are listed twice (apparently the
; old and reformatted copies of the same lines) -- confirm that only one copy
; is intended.
%macro PUT_BITS 1
add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1
add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1
%endmacro
; CHECKBUF31 (no arguments)
; If put_bits is at least 32 (signed compare), expands EMIT_BYTE four times
; to write bytes out of the bit buffer; otherwise falls straight through.
; Clobbers whatever EMIT_BYTE clobbers (rdx, ecx, flags).
; NOTE(review): the compare/branch and the four EMIT_BYTE lines are listed
; twice (apparently old and reformatted copies) -- confirm that only one copy
; is intended.
%macro CHECKBUF31 0
cmp put_bits, 32 ; if (put_bits > 31) {
jl %%.CHECKBUF31_END
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
cmp put_bits, 32 ; if (put_bits > 31) {
jl %%.CHECKBUF31_END
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
%%.CHECKBUF31_END:
%endmacro
; CHECKBUF47 (no arguments)
; If put_bits is at least 48 (signed compare), expands EMIT_BYTE six times
; to write bytes out of the bit buffer; otherwise falls straight through.
; Clobbers whatever EMIT_BYTE clobbers (rdx, ecx, flags).
; NOTE(review): the compare/branch and the six EMIT_BYTE lines are listed
; twice (apparently old and reformatted copies) -- confirm that only one copy
; is intended.
%macro CHECKBUF47 0
cmp put_bits, 48 ; if (put_bits > 47) {
jl %%.CHECKBUF47_END
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
cmp put_bits, 48 ; if (put_bits > 47) {
jl %%.CHECKBUF47_END
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
%%.CHECKBUF47_END:
%endmacro
; EMIT_BITS code, size
; Runs CHECKBUF47 first (which may write bytes out of the bit buffer), then
; loads the bit count %2 into ecx and appends %1 with PUT_BITS.
; In:  %1 = value to append (ORed into put_buffer); %2 = number of bits.
; Clobbers: ecx, plus whatever CHECKBUF47 clobbers (rdx, flags).
; NOTE(review): the three lines below are listed twice (apparently old and
; reformatted copies) -- confirm that only one copy is intended.
%macro EMIT_BITS 2
CHECKBUF47
mov ecx, %2
PUT_BITS %1
CHECKBUF47
mov ecx, %2
PUT_BITS %1
%endmacro
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32
pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else
pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
%endif
pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
;
@@ -176,185 +176,185 @@ EXTN(jconst_huff_encode_one_block):
; r14 = c_derived_tbl *dctbl
; r15 = c_derived_tbl *actbl
%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
%define put_buffer r8
%define put_bits r9d
%define buffer rax
%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
%define put_buffer r8
%define put_bits r9d
%define buffer rax
align 16
global EXTN(jsimd_huff_encode_one_block_sse2)
align 16
global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2]
collect_args
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2]
collect_args
%ifdef WIN64
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
sub rsp, 4*SIZEOF_XMMWORD
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
sub rsp, 4*SIZEOF_XMMWORD
%endif
push rbx
push rbx
mov buffer, r11 ; r11 is now scratch
mov buffer, r11 ; r11 is now scratch
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
push r10 ; r10 is now scratch
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
push r10 ; r10 is now scratch
; Encode the DC coefficient difference per section F.1.2.1
movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
sub edi, r13d ; r13 is not used anymore
mov ebx, edi
; Encode the DC coefficient difference per section F.1.2.1
movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
sub edi, r13d ; r13 is not used anymore
mov ebx, edi
; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog.
mov esi, edi
sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
xor edi, esi ; temp ^= temp3;
sub edi, esi ; temp -= temp3;
; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog.
mov esi, edi
sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
xor edi, esi ; temp ^= temp3;
sub edi, esi ; temp -= temp3;
; For a negative input, want temp2 = bitwise complement of abs(input)
; This code assumes we are on a two's complement machine
add ebx, esi ; temp2 += temp3;
; For a negative input, want temp2 = bitwise complement of abs(input)
; This code assumes we are on a two's complement machine
add ebx, esi ; temp2 += temp3;
; Find the number of bits needed for the magnitude of the coefficient
lea r11, [rel jpeg_nbits_table]
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
; Emit the Huffman-coded symbol for the number of bits
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS r11, esi ; EMIT_BITS(code, size)
; Find the number of bits needed for the magnitude of the coefficient
lea r11, [rel jpeg_nbits_table]
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
; Emit the Huffman-coded symbol for the number of bits
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS r11, esi ; EMIT_BITS(code, size)
; Mask off any extra bits in code
mov esi, 1
mov ecx, edi
shl esi, cl
dec esi
and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
; Mask off any extra bits in code
mov esi, 1
mov ecx, edi
shl esi, cl
dec esi
and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative.
EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative.
EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
; Prepare data
xor ebx, ebx
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
27, 20, 13, 6, 7, 14, 21, 28, 35, \
xmm0, xmm1, xmm2, xmm3
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
53, 60, 61, 54, 47, 55, 62, 63, 63, \
xmm4, xmm5, xmm6, xmm7
; Prepare data
xor ebx, ebx
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
27, 20, 13, 6, 7, 14, 21, 28, 35, \
xmm0, xmm1, xmm2, xmm3
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
53, 60, 61, 54, 47, 55, 62, 63, 63, \
xmm4, xmm5, xmm6, xmm7
pxor xmm8, xmm8
pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
shl r12, 16
shl r14, 16
or r11, r12
or r13, r14
shl r13, 32
or r11, r13
not r11 ; index = ~index;
pxor xmm8, xmm8
pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
shl r12, 16
shl r14, 16
or r11, r12
or r13, r14
shl r13, 32
or r11, r13
not r11 ; index = ~index;
;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
;jmp .EFN
;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
;jmp .EFN
mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
lea rsi, [t1]
mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
lea rsi, [t1]
.BLOOP:
bsf r12, r11 ; r = __builtin_ctzl(index);
jz .ELOOP
mov rcx, r12
lea rsi, [rsi+r12*2] ; k += r;
shr r11, cl ; index >>= r;
movzx rdi, word [rsi] ; temp = t1[k];
lea rbx, [rel jpeg_nbits_table]
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
bsf r12, r11 ; r = __builtin_ctzl(index);
jz .ELOOP
mov rcx, r12
lea rsi, [rsi+r12*2] ; k += r;
shr r11, cl ; index >>= r;
movzx rdi, word [rsi] ; temp = t1[k];
lea rbx, [rel jpeg_nbits_table]
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
.BRLOOP:
cmp r12, 16 ; while (r > 15) {
jl .ERLOOP
EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
sub r12, 16 ; r -= 16;
jmp .BRLOOP
cmp r12, 16 ; while (r > 15) {
jl .ERLOOP
EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
sub r12, 16 ; r -= 16;
jmp .BRLOOP
.ERLOOP:
; Emit Huffman symbol for run length / number of bits
CHECKBUF31 ; uses rcx, rdx
; Emit Huffman symbol for run length / number of bits
CHECKBUF31 ; uses rcx, rdx
shl r12, 4 ; temp3 = (r << 4) + nbits;
add r12, rdi
mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
PUT_BITS rbx
shl r12, 4 ; temp3 = (r << 4) + nbits;
add r12, rdi
mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
PUT_BITS rbx
;EMIT_CODE(code, size)
;EMIT_CODE(code, size)
movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov rcx, rdi
mov rdx, 1
shl rdx, cl
dec rdx
and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
PUT_BITS rbx ; PUT_BITS(temp2, nbits)
movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov rcx, rdi
mov rdx, 1
shl rdx, cl
dec rdx
and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
PUT_BITS rbx ; PUT_BITS(temp2, nbits)
shr r11, 1 ; index >>= 1;
add rsi, 2 ; ++k;
jmp .BLOOP
shr r11, 1 ; index >>= 1;
add rsi, 2 ; ++k;
jmp .BLOOP
.ELOOP:
; If the last coef(s) were zero, emit an end-of-block code
lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
cmp rdi, rsi ; if (r > 0) {
je .EFN
mov ebx, INT [r15] ; code = actbl->ehufco[0];
movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
EMIT_BITS rbx, r12d
; If the last coef(s) were zero, emit an end-of-block code
lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
cmp rdi, rsi ; if (r > 0) {
je .EFN
mov ebx, INT [r15] ; code = actbl->ehufco[0];
movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
EMIT_BITS rbx, r12d
.EFN:
pop r10
; Save put_buffer & put_bits
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
pop r10
; Save put_buffer & put_bits
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
pop rbx
pop rbx
%ifdef WIN64
movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD
movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD
%endif
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -23,20 +23,20 @@
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_huff_encode_one_block)
alignz 16
global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly
@@ -46,105 +46,105 @@ EXTN(jconst_huff_encode_one_block):
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
; EMIT_BYTE (no arguments), 32-bit variant
; Removes 8 bits from the bit count, shifts put_buffer right by the new
; put_bits, and stores the resulting low byte at [eax], advancing eax by one.
; If the stored byte is 0xFF, a zero byte is stored after it and eax is
; advanced again.
; In/out: eax = output pointer (the caller loads and saves it).
; Clobbers: edx, ecx (shift count in cl), flags.  Updates put_bits.
; NOTE(review): the instruction sequence below is listed twice (it looks like
; the pre- and post-reformat copies of the same lines); as written it would
; be assembled twice per expansion -- confirm that only one copy is intended.
%macro EMIT_BYTE 0
sub put_bits, 8 ; put_bits -= 8;
mov edx, put_buffer
mov ecx, put_bits
shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [eax], dl ; *buffer++ = c;
add eax, 1
cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END
mov byte [eax], 0 ; *buffer++ = 0;
add eax, 1
sub put_bits, 8 ; put_bits -= 8;
mov edx, put_buffer
mov ecx, put_bits
shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [eax], dl ; *buffer++ = c;
add eax, 1
cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END
mov byte [eax], 0 ; *buffer++ = 0;
add eax, 1
%%.EMIT_BYTE_END:
%endmacro
; PUT_BITS code, 32-bit variant
; Appends bits to the bit buffer: adds ecx to put_bits, shifts put_buffer
; left by cl, and ORs %1 into put_buffer.
; In:  %1 = value to OR in; ecx (cl) = number of bits, set by the caller.
; Clobbers: flags.  Does not write any bytes to the output buffer.
; NOTE(review): the three instructions below are listed twice (apparently the
; old and reformatted copies of the same lines) -- confirm that only one copy
; is intended.
%macro PUT_BITS 1
add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1
add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1
%endmacro
; CHECKBUF15 (no arguments)
; If put_bits is at least 16 (signed compare), loads the output pointer from
; the stack slot [esp+buffer] into eax, expands EMIT_BYTE twice, and stores
; the advanced pointer back to the same slot; otherwise falls straight
; through.
; Clobbers: eax, plus whatever EMIT_BYTE clobbers (edx, ecx, flags).
; NOTE(review): the sequence below is listed twice (apparently old and
; reformatted copies) -- confirm that only one copy is intended.
%macro CHECKBUF15 0
cmp put_bits, 16 ; if (put_bits > 15) {
jl %%.CHECKBUF15_END
mov eax, POINTER [esp+buffer]
EMIT_BYTE
EMIT_BYTE
mov POINTER [esp+buffer], eax
cmp put_bits, 16 ; if (put_bits > 15) {
jl %%.CHECKBUF15_END
mov eax, POINTER [esp+buffer]
EMIT_BYTE
EMIT_BYTE
mov POINTER [esp+buffer], eax
%%.CHECKBUF15_END:
%endmacro
; EMIT_BITS code, 32-bit variant
; Appends %1 to the bit buffer with PUT_BITS, then runs CHECKBUF15, which may
; write bytes out of the bit buffer.
; In:  %1 = value to append; ecx (cl) = number of bits, which the caller must
;      set beforehand because PUT_BITS reads it.
; Clobbers: whatever CHECKBUF15 clobbers (eax, edx, ecx, flags).
; NOTE(review): the two lines below are listed twice (apparently old and
; reformatted copies) -- confirm that only one copy is intended.
%macro EMIT_BITS 1
PUT_BITS %1
CHECKBUF15
PUT_BITS %1
CHECKBUF15
%endmacro
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32
pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else
pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
%endif
pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
;
@@ -163,264 +163,264 @@ EXTN(jconst_huff_encode_one_block):
; eax + 24 = c_derived_tbl *dctbl
; eax + 28 = c_derived_tbl *actbl
; Stack-frame layout used by jsimd_huff_encode_one_block_sse2.  Each name
; below is a byte offset that the function adds to esp:
;   t1, t2         two arrays of DCTSIZE2 words filled by kloop_prepare
;                  (t1 receives the x1 values, t2 the neg values)
;   block, actbl,  DWORD slots that receive the values loaded from
;   buffer         [eax+16], [eax+28] and [eax+12] respectively
;   temp..temp5    DWORD-sized slots (temp, temp2 and temp3 are used as
;                  scratch in the function body)
;   gotptr         saved GOT address
; pad is six DWORDs, the same amount the function pushes after reserving the
; frame (ebx, ecx, esi, edi, ebp, then the state pointer in esi).
; put_buffer and put_bits are register aliases, not stack slots.
; NOTE(review): the movdqa accesses to t1/t2 presumably rely on
; DCTSIZE2*SIZEOF_WORD being a multiple of 16 -- confirm in jsimdext.inc.
%define pad 6*SIZEOF_DWORD ; Align to 16 bytes
%define t1 pad
%define t2 t1+(DCTSIZE2*SIZEOF_WORD)
%define block t2+(DCTSIZE2*SIZEOF_WORD)
%define actbl block+SIZEOF_DWORD
%define buffer actbl+SIZEOF_DWORD
%define temp buffer+SIZEOF_DWORD
%define temp2 temp+SIZEOF_DWORD
%define temp3 temp2+SIZEOF_DWORD
%define temp4 temp3+SIZEOF_DWORD
%define temp5 temp4+SIZEOF_DWORD
%define gotptr temp5+SIZEOF_DWORD ; void *gotptr
%define put_buffer ebx
%define put_bits edi
%define pad 6*SIZEOF_DWORD ; Align to 16 bytes
%define t1 pad
%define t2 t1+(DCTSIZE2*SIZEOF_WORD)
%define block t2+(DCTSIZE2*SIZEOF_WORD)
%define actbl block+SIZEOF_DWORD
%define buffer actbl+SIZEOF_DWORD
%define temp buffer+SIZEOF_DWORD
%define temp2 temp+SIZEOF_DWORD
%define temp3 temp2+SIZEOF_DWORD
%define temp4 temp3+SIZEOF_DWORD
%define temp5 temp4+SIZEOF_DWORD
%define gotptr temp5+SIZEOF_DWORD ; void *gotptr
%define put_buffer ebx
%define put_bits edi
align 16
global EXTN(jsimd_huff_encode_one_block_sse2)
align 16
global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
sub esp, temp5+9*SIZEOF_DWORD-pad
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
sub esp, temp5+9*SIZEOF_DWORD-pad
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
mov esi, POINTER [eax+8] ; (working_state *state)
mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
push esi ; esi is now scratch
mov esi, POINTER [eax+8] ; (working_state *state)
mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
push esi ; esi is now scratch
get_GOT edx ; get GOT address
movpic POINTER [esp+gotptr], edx ; save GOT address
get_GOT edx ; get GOT address
movpic POINTER [esp+gotptr], edx ; save GOT address
mov ecx, POINTER [eax+28]
mov edx, POINTER [eax+16]
mov esi, POINTER [eax+12]
mov POINTER [esp+actbl], ecx
mov POINTER [esp+block], edx
mov POINTER [esp+buffer], esi
mov ecx, POINTER [eax+28]
mov edx, POINTER [eax+16]
mov esi, POINTER [eax+12]
mov POINTER [esp+actbl], ecx
mov POINTER [esp+block], edx
mov POINTER [esp+buffer], esi
; Encode the DC coefficient difference per section F.1.2.1
mov esi, POINTER [esp+block] ; block
movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
sub ecx, DWORD [eax+20]
mov esi, ecx
; Encode the DC coefficient difference per section F.1.2.1
mov esi, POINTER [esp+block] ; block
movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
sub ecx, DWORD [eax+20]
mov esi, ecx
; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog.
mov edx, ecx
sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
xor ecx, edx ; temp ^= temp3;
sub ecx, edx ; temp -= temp3;
; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog.
mov edx, ecx
sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
xor ecx, edx ; temp ^= temp3;
sub ecx, edx ; temp -= temp3;
; For a negative input, want temp2 = bitwise complement of abs(input)
; This code assumes we are on a two's complement machine
add esi, edx ; temp2 += temp3;
mov DWORD [esp+temp], esi ; backup temp2 in temp
; For a negative input, want temp2 = bitwise complement of abs(input)
; This code assumes we are on a two's complement machine
add esi, edx ; temp2 += temp3;
mov DWORD [esp+temp], esi ; backup temp2 in temp
; Find the number of bits needed for the magnitude of the coefficient
movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
mov DWORD [esp+temp2], edx ; backup nbits in temp2
; Find the number of bits needed for the magnitude of the coefficient
movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
mov DWORD [esp+temp2], edx ; backup nbits in temp2
; Emit the Huffman-coded symbol for the number of bits
mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS eax ; EMIT_BITS(code, size)
; Emit the Huffman-coded symbol for the number of bits
mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS eax ; EMIT_BITS(code, size)
mov ecx, DWORD [esp+temp2] ; restore nbits
mov ecx, DWORD [esp+temp2] ; restore nbits
; Mask off any extra bits in code
mov eax, 1
shl eax, cl
dec eax
and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1;
; Mask off any extra bits in code
mov eax, 1
shl eax, cl
dec eax
and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1;
; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative.
EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative.
EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
; Prepare data
xor ecx, ecx
mov esi, POINTER [esp+block]
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
27, 20, 13, 6, 7, 14, 21, 28, 35, \
xmm0, xmm1, xmm2, xmm3
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
53, 60, 61, 54, 47, 55, 62, 63, 63, \
xmm0, xmm1, xmm2, xmm3
; Prepare data
xor ecx, ecx
mov esi, POINTER [esp+block]
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
27, 20, 13, 6, 7, 14, 21, 28, 35, \
xmm0, xmm1, xmm2, xmm3
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
53, 60, 61, 54, 47, 55, 62, 63, 63, \
xmm0, xmm1, xmm2, xmm3
pxor xmm7, xmm7
movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
shl ecx, 16
or edx, ecx
not edx ; index = ~index;
pxor xmm7, xmm7
movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
shl ecx, 16
or edx, ecx
not edx ; index = ~index;
lea esi, [esp+t1]
mov ebp, POINTER [esp+actbl] ; ebp = actbl
lea esi, [esp+t1]
mov ebp, POINTER [esp+actbl] ; ebp = actbl
.BLOOP:
bsf ecx, edx ; r = __builtin_ctzl(index);
jz .ELOOP
lea esi, [esi+ecx*2] ; k += r;
shr edx, cl ; index >>= r;
mov DWORD [esp+temp3], edx
bsf ecx, edx ; r = __builtin_ctzl(index);
jz .ELOOP
lea esi, [esi+ecx*2] ; k += r;
shr edx, cl ; index >>= r;
mov DWORD [esp+temp3], edx
.BRLOOP:
cmp ecx, 16 ; while (r > 15) {
jl .ERLOOP
sub ecx, 16 ; r -= 16;
mov DWORD [esp+temp], ecx
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
mov ecx, DWORD [esp+temp]
jmp .BRLOOP
cmp ecx, 16 ; while (r > 15) {
jl .ERLOOP
sub ecx, 16 ; r -= 16;
mov DWORD [esp+temp], ecx
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
mov ecx, DWORD [esp+temp]
jmp .BRLOOP
.ERLOOP:
movsx eax, word [esi] ; temp = t1[k];
movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
mov DWORD [esp+temp2], eax
; Emit Huffman symbol for run length / number of bits
shl ecx, 4 ; temp3 = (r << 4) + nbits;
add ecx, eax
mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
EMIT_BITS eax
movsx eax, word [esi] ; temp = t1[k];
movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
mov DWORD [esp+temp2], eax
; Emit Huffman symbol for run length / number of bits
shl ecx, 4 ; temp3 = (r << 4) + nbits;
add ecx, eax
mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
EMIT_BITS eax
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov ecx, DWORD [esp+temp2]
mov eax, 1
shl eax, cl
dec eax
and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
mov edx, DWORD [esp+temp3]
add esi, 2 ; ++k;
shr edx, 1 ; index >>= 1;
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov ecx, DWORD [esp+temp2]
mov eax, 1
shl eax, cl
dec eax
and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
mov edx, DWORD [esp+temp3]
add esi, 2 ; ++k;
shr edx, 1 ; index >>= 1;
jmp .BLOOP
jmp .BLOOP
.ELOOP:
; Reached when the bitmap for t1[0..31] has no set bits left (bsf found edx
; zero).  Build the same nonzero bitmap for t1[32..63], then step k (esi) to
; the first nonzero entry of the second half.
movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
shl ecx, 16 ; move tmp2's mask to bits 16..31
or edx, ecx ; merge both 16-bit masks into one 32-bit bitmap
not edx ; index = ~index;
movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
shl ecx, 16 ; move tmp2's mask to bits 16..31
or edx, ecx ; merge both 16-bit masks into one 32-bit bitmap
not edx ; index = ~index;
; eax = number of t1 entries between the current k (esi) and t1[DCTSIZE2/2].
; It is added to r below so the run length and k account for those entries.
lea eax, [esp + t1 + (DCTSIZE2/2) * 2] ; eax = &t1[DCTSIZE2/2]
sub eax, esi ; eax = byte distance from k to t1[DCTSIZE2/2]
shr eax, 1 ; eax = that distance in words
bsf ecx, edx ; r = __builtin_ctzl(index);
jz .ELOOP2
shr edx, cl ; index >>= r;
add ecx, eax ; r += entries between k and t1[DCTSIZE2/2]
lea esi, [esi+ecx*2] ; k += r;
mov DWORD [esp+temp3], edx
jmp .BRLOOP2
lea eax, [esp + t1 + (DCTSIZE2/2) * 2] ; eax = &t1[DCTSIZE2/2]
sub eax, esi ; eax = byte distance from k to t1[DCTSIZE2/2]
shr eax, 1 ; eax = that distance in words
bsf ecx, edx ; r = __builtin_ctzl(index);
jz .ELOOP2
shr edx, cl ; index >>= r;
add ecx, eax ; r += entries between k and t1[DCTSIZE2/2]
lea esi, [esi+ecx*2] ; k += r;
mov DWORD [esp+temp3], edx
jmp .BRLOOP2
.BLOOP2:
bsf ecx, edx ; r = __builtin_ctzl(index);
jz .ELOOP2
lea esi, [esi+ecx*2] ; k += r;
shr edx, cl ; index >>= r;
mov DWORD [esp+temp3], edx
bsf ecx, edx ; r = __builtin_ctzl(index);
jz .ELOOP2
lea esi, [esi+ecx*2] ; k += r;
shr edx, cl ; index >>= r;
mov DWORD [esp+temp3], edx
.BRLOOP2:
cmp ecx, 16 ; while (r > 15) {
jl .ERLOOP2
sub ecx, 16 ; r -= 16;
mov DWORD [esp+temp], ecx
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
mov ecx, DWORD [esp+temp]
jmp .BRLOOP2
cmp ecx, 16 ; while (r > 15) {
jl .ERLOOP2
sub ecx, 16 ; r -= 16;
mov DWORD [esp+temp], ecx
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
mov ecx, DWORD [esp+temp]
jmp .BRLOOP2
.ERLOOP2:
movsx eax, word [esi] ; temp = t1[k];
bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
inc eax
mov DWORD [esp+temp2], eax
; Emit Huffman symbol for run length / number of bits
shl ecx, 4 ; temp3 = (r << 4) + nbits;
add ecx, eax
mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
EMIT_BITS eax
movsx eax, word [esi] ; temp = t1[k];
bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
inc eax
mov DWORD [esp+temp2], eax
; Emit Huffman symbol for run length / number of bits
shl ecx, 4 ; temp3 = (r << 4) + nbits;
add ecx, eax
mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
EMIT_BITS eax
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov ecx, DWORD [esp+temp2]
mov eax, 1
shl eax, cl
dec eax
and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
mov edx, DWORD [esp+temp3]
add esi, 2 ; ++k;
shr edx, 1 ; index >>= 1;
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov ecx, DWORD [esp+temp2]
mov eax, 1
shl eax, cl
dec eax
and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
mov edx, DWORD [esp+temp3]
add esi, 2 ; ++k;
shr edx, 1 ; index >>= 1;
jmp .BLOOP2
jmp .BLOOP2
.ELOOP2:
; If the last coef(s) were zero, emit an end-of-block code
; (the code/size pair is entry 0 of actbl; the emit is skipped only when esi
; has advanced exactly to t1[DCTSIZE2-1]).
lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
cmp edx, esi ; if (r > 0) {
je .EFN
mov eax, INT [ebp] ; code = actbl->ehufco[0];
movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
EMIT_BITS eax
; If the last coef(s) were zero, emit an end-of-block code
lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
cmp edx, esi ; if (r > 0) {
je .EFN
mov eax, INT [ebp] ; code = actbl->ehufco[0];
movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
EMIT_BITS eax
.EFN:
mov eax, [esp+buffer]
pop esi
; Save put_buffer & put_bits
mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
mov eax, [esp+buffer]
pop esi
; Save put_buffer & put_bits
mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -19,8 +19,8 @@
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
@@ -39,130 +39,130 @@
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
align 16
global EXTN(jsimd_h2v1_downsample_sse2)
align 16
global EXTN(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return
mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return
mov edx, r10d
mov edx, r10d
; -- expand_right_edge
; -- expand_right_edge
push rcx
shl rcx,1 ; output_cols * 2
sub rcx,rdx
jle short .expand_end
push rcx
shl rcx, 1 ; output_cols * 2
sub rcx, rdx
jle short .expand_end
mov rax, r11
test rax,rax
jle short .expand_end
mov rax, r11
test rax, rax
jle short .expand_end
cld
mov rsi, r14 ; input_data
cld
mov rsi, r14 ; input_data
.expandloop:
push rax
push rcx
push rax
push rcx
mov rdi, JSAMPROW [rsi]
add rdi,rdx
mov al, JSAMPLE [rdi-1]
mov rdi, JSAMPROW [rsi]
add rdi, rdx
mov al, JSAMPLE [rdi-1]
rep stosb
rep stosb
pop rcx
pop rax
pop rcx
pop rax
add rsi, byte SIZEOF_JSAMPROW
dec rax
jg short .expandloop
add rsi, byte SIZEOF_JSAMPROW
dec rax
jg short .expandloop
.expand_end:
pop rcx ; output_cols
pop rcx ; output_cols
; -- h2v1_downsample
; -- h2v1_downsample
mov eax, r12d ; rowctr
test eax,eax
jle near .return
mov eax, r12d ; rowctr
test eax, eax
jle near .return
mov rdx, 0x00010000 ; bias pattern
movd xmm7,edx
pcmpeqw xmm6,xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov rdx, 0x00010000 ; bias pattern
movd xmm7, edx
pcmpeqw xmm6, xmm6
pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov rsi, r14 ; input_data
mov rdi, r15 ; output_data
mov rsi, r14 ; input_data
mov rdi, r15 ; output_data
.rowloop:
push rcx
push rdi
push rsi
push rcx
push rdi
push rsi
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
.columnloop_r8:
; Tail case: fewer than SIZEOF_XMMWORD output columns remain in rcx.
; Load only one xmmword of input, use zeros for the second, and force
; rcx to SIZEOF_XMMWORD so the "sub rcx" after .downsample leaves 0 and the
; column loop terminates after this pass.
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
pxor xmm1,xmm1
mov rcx, SIZEOF_XMMWORD
jmp short .downsample
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
pxor xmm1, xmm1
mov rcx, SIZEOF_XMMWORD
jmp short .downsample
.columnloop:
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample:
movdqa xmm2,xmm0
movdqa xmm3,xmm1
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0,xmm6
psrlw xmm2,BYTE_BIT
pand xmm1,xmm6
psrlw xmm3,BYTE_BIT
pand xmm0, xmm6
psrlw xmm2, BYTE_BIT
pand xmm1, xmm6
psrlw xmm3, BYTE_BIT
paddw xmm0,xmm2
paddw xmm1,xmm3
paddw xmm0,xmm7
paddw xmm1,xmm7
psrlw xmm0,1
psrlw xmm1,1
paddw xmm0, xmm2
paddw xmm1, xmm3
paddw xmm0, xmm7
paddw xmm1, xmm7
psrlw xmm0, 1
psrlw xmm1, 1
packuswb xmm0,xmm1
packuswb xmm0, xmm1
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
sub rcx, byte SIZEOF_XMMWORD ; outcol
add rsi, byte 2*SIZEOF_XMMWORD ; inptr
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
test rcx,rcx
jnz short .columnloop_r8
sub rcx, byte SIZEOF_XMMWORD ; outcol
add rsi, byte 2*SIZEOF_XMMWORD ; inptr
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
test rcx, rcx
jnz short .columnloop_r8
pop rsi
pop rdi
pop rcx
pop rsi
pop rdi
pop rcx
add rsi, byte SIZEOF_JSAMPROW ; input_data
add rdi, byte SIZEOF_JSAMPROW ; output_data
dec rax ; rowctr
jg near .rowloop
add rsi, byte SIZEOF_JSAMPROW ; input_data
add rdi, byte SIZEOF_JSAMPROW ; output_data
dec rax ; rowctr
jg near .rowloop
.return:
uncollect_args
pop rbp
ret
uncollect_args
pop rbp
ret
; --------------------------------------------------------------------------
;
@@ -183,147 +183,147 @@ EXTN(jsimd_h2v1_downsample_sse2):
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
align 16
global EXTN(jsimd_h2v2_downsample_sse2)
align 16
global EXTN(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2):
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return
mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return
mov edx, r10d
mov edx, r10d
; -- expand_right_edge
; -- expand_right_edge
push rcx
shl rcx,1 ; output_cols * 2
sub rcx,rdx
jle short .expand_end
push rcx
shl rcx, 1 ; output_cols * 2
sub rcx, rdx
jle short .expand_end
mov rax, r11
test rax,rax
jle short .expand_end
mov rax, r11
test rax, rax
jle short .expand_end
cld
mov rsi, r14 ; input_data
cld
mov rsi, r14 ; input_data
.expandloop:
push rax
push rcx
push rax
push rcx
mov rdi, JSAMPROW [rsi]
add rdi,rdx
mov al, JSAMPLE [rdi-1]
mov rdi, JSAMPROW [rsi]
add rdi, rdx
mov al, JSAMPLE [rdi-1]
rep stosb
rep stosb
pop rcx
pop rax
pop rcx
pop rax
add rsi, byte SIZEOF_JSAMPROW
dec rax
jg short .expandloop
add rsi, byte SIZEOF_JSAMPROW
dec rax
jg short .expandloop
.expand_end:
pop rcx ; output_cols
pop rcx ; output_cols
; -- h2v2_downsample
; -- h2v2_downsample
mov eax, r12d ; rowctr
test rax,rax
jle near .return
mov eax, r12d ; rowctr
test rax, rax
jle near .return
mov rdx, 0x00020001 ; bias pattern
movd xmm7,edx
pcmpeqw xmm6,xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov rdx, 0x00020001 ; bias pattern
movd xmm7, edx
pcmpeqw xmm6, xmm6
pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov rsi, r14 ; input_data
mov rdi, r15 ; output_data
mov rsi, r14 ; input_data
mov rdi, r15 ; output_data
.rowloop:
push rcx
push rdi
push rsi
push rcx
push rdi
push rsi
mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
mov rdi, JSAMPROW [rdi] ; outptr
mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
mov rdi, JSAMPROW [rdi] ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
.columnloop_r8:
; Tail case: fewer than SIZEOF_XMMWORD output columns remain in rcx.
; Load one xmmword from each of the two input rows (rdx = inptr0,
; rsi = inptr1), use zeros for the second xmmword of each row, and force
; rcx to SIZEOF_XMMWORD so the "sub rcx" after .downsample leaves 0 and the
; column loop terminates after this pass.
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
pxor xmm2,xmm2
pxor xmm3,xmm3
mov rcx, SIZEOF_XMMWORD
jmp short .downsample
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
pxor xmm2, xmm2
pxor xmm3, xmm3
mov rcx, SIZEOF_XMMWORD
jmp short .downsample
.columnloop:
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample:
movdqa xmm4,xmm0
movdqa xmm5,xmm1
pand xmm0,xmm6
psrlw xmm4,BYTE_BIT
pand xmm1,xmm6
psrlw xmm5,BYTE_BIT
paddw xmm0,xmm4
paddw xmm1,xmm5
movdqa xmm4, xmm0
movdqa xmm5, xmm1
pand xmm0, xmm6
psrlw xmm4, BYTE_BIT
pand xmm1, xmm6
psrlw xmm5, BYTE_BIT
paddw xmm0, xmm4
paddw xmm1, xmm5
movdqa xmm4,xmm2
movdqa xmm5,xmm3
pand xmm2,xmm6
psrlw xmm4,BYTE_BIT
pand xmm3,xmm6
psrlw xmm5,BYTE_BIT
paddw xmm2,xmm4
paddw xmm3,xmm5
movdqa xmm4, xmm2
movdqa xmm5, xmm3
pand xmm2, xmm6
psrlw xmm4, BYTE_BIT
pand xmm3, xmm6
psrlw xmm5, BYTE_BIT
paddw xmm2, xmm4
paddw xmm3, xmm5
paddw xmm0,xmm1
paddw xmm2,xmm3
paddw xmm0,xmm7
paddw xmm2,xmm7
psrlw xmm0,2
psrlw xmm2,2
paddw xmm0, xmm1
paddw xmm2, xmm3
paddw xmm0, xmm7
paddw xmm2, xmm7
psrlw xmm0, 2
psrlw xmm2, 2
packuswb xmm0,xmm2
packuswb xmm0, xmm2
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
sub rcx, byte SIZEOF_XMMWORD ; outcol
add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
test rcx,rcx
jnz near .columnloop_r8
sub rcx, byte SIZEOF_XMMWORD ; outcol
add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
test rcx, rcx
jnz near .columnloop_r8
pop rsi
pop rdi
pop rcx
pop rsi
pop rdi
pop rcx
add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
dec rax ; rowctr
jg near .rowloop
add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
dec rax ; rowctr
jg near .rowloop
.return:
uncollect_args
pop rbp
ret
uncollect_args
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -18,8 +18,8 @@
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
@@ -38,141 +38,141 @@
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
align 16
global EXTN(jsimd_h2v1_downsample_sse2)
align 16
global EXTN(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
push ebp
mov ebp,esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
; -- expand_right_edge
push ecx
shl ecx,1 ; output_cols * 2
sub ecx,edx
jle short .expand_end
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax,eax
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16,7
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
; Right-edge expansion, one input row per iteration.
;   eax = rows left (initialised from max_v_samp)
;   ecx = number of padding bytes to write (output_cols*2 - img_width)
;   edx = img_width, esi = current position in input_data
; The last real sample of the row (byte at row + img_width - 1) is copied
; ecx times starting at row + img_width.  eax and ecx are saved around the
; copy because al holds the fill byte and rep stosb consumes ecx.
push eax
push ecx
push eax
push ecx
mov edi, JSAMPROW [esi] ; edi = start of this row
add edi,edx ; edi = first byte past the real samples
mov al, JSAMPLE [edi-1] ; al = last real sample of the row
mov edi, JSAMPROW [esi] ; edi = start of this row
add edi, edx ; edi = first byte past the real samples
mov al, JSAMPLE [edi-1] ; al = last real sample of the row
rep stosb ; store al ecx times at [edi]
rep stosb ; store al ecx times at [edi]
pop ecx
pop eax
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
dec eax
jg short .expandloop
add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
pop ecx ; output_cols
; -- h2v1_downsample
; -- h2v1_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax,eax
jle near .return
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00010000 ; bias pattern
movd xmm7,edx
pcmpeqw xmm6,xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov edx, 0x00010000 ; bias pattern
movd xmm7, edx
pcmpeqw xmm6, xmm6
pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16,7
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
push ecx
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
alignx 16,7
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
alignx 16, 7
.columnloop_r8:
; Tail case: fewer than SIZEOF_XMMWORD output columns remain in ecx.
; Load only one xmmword of input, use zeros for the second, and force
; ecx to SIZEOF_XMMWORD so the "sub ecx" after .downsample leaves 0 and the
; column loop terminates after this pass.
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm1,xmm1
mov ecx, SIZEOF_XMMWORD
jmp short .downsample
alignx 16,7
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm1, xmm1
mov ecx, SIZEOF_XMMWORD
jmp short .downsample
alignx 16, 7
.columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
movdqa xmm2,xmm0
movdqa xmm3,xmm1
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0,xmm6
psrlw xmm2,BYTE_BIT
pand xmm1,xmm6
psrlw xmm3,BYTE_BIT
pand xmm0, xmm6
psrlw xmm2, BYTE_BIT
pand xmm1, xmm6
psrlw xmm3, BYTE_BIT
paddw xmm0,xmm2
paddw xmm1,xmm3
paddw xmm0,xmm7
paddw xmm1,xmm7
psrlw xmm0,1
psrlw xmm1,1
paddw xmm0, xmm2
paddw xmm1, xmm3
paddw xmm0, xmm7
paddw xmm1, xmm7
psrlw xmm0, 1
psrlw xmm1, 1
packuswb xmm0,xmm1
packuswb xmm0, xmm1
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
sub ecx, byte SIZEOF_XMMWORD ; outcol
add esi, byte 2*SIZEOF_XMMWORD ; inptr
add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
test ecx,ecx
jnz short .columnloop_r8
sub ecx, byte SIZEOF_XMMWORD ; outcol
add esi, byte 2*SIZEOF_XMMWORD ; inptr
add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
test ecx, ecx
jnz short .columnloop_r8
pop esi
pop edi
pop ecx
pop esi
pop edi
pop ecx
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
@@ -193,158 +193,158 @@ EXTN(jsimd_h2v1_downsample_sse2):
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
align 16
global EXTN(jsimd_h2v2_downsample_sse2)
align 16
global EXTN(jsimd_h2v2_downsample_sse2)
; --------------------------------------------------------------------------
; jsimd_h2v2_downsample_sse2 (i386, stack arguments addressed through ebp)
;
; NOTE(review): in this listing every instruction appears twice in a row: the
; pre-reformat spelling followed by the post-reformat spelling of the same
; line (diff rendering of a whitespace-only commit).  Read each pair as ONE
; instruction.
;
; Purpose: 2:1 horizontal and 2:1 vertical downsampling.  Each output sample
; is the biased average of a 2x2 group of input samples (two adjacent bytes
; from each of two consecutive input rows), shifted right by 2.
;
; Arguments read below: width_blks, img_width, max_v_samp, v_samp,
; input_data, output_data.  Only the last two %defines are visible in this
; chunk; the others are assumed to be defined just above.
;
; Steps:
;   1. expand_right_edge: pad every input row on the right with its last
;      sample so that output_cols*2 input bytes are available.
;   2. h2v2_downsample: for each output row, consume two input rows.
;
; Registers saved/restored: ebp, esi, edi.  eax, ecx, edx, xmm0-xmm7 are
; used as scratch.  Row data is accessed with movdqa, so row pointers are
; assumed to be 16-byte aligned.
; --------------------------------------------------------------------------
EXTN(jsimd_h2v2_downsample_sse2):
push ebp
mov ebp,esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; ecx = output_cols = width_blks * 8; nothing to do when it is zero
; (the shl sets ZF, which the jz tests).
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
; -- expand_right_edge
; ecx = output_cols*2 - img_width = number of padding bytes per row.
; Skip the padding when it is <= 0; output_cols itself is kept on the stack.
push ecx
shl ecx,1 ; output_cols * 2
sub ecx,edx
jle short .expand_end
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
; eax = number of input rows to pad (max_v_samp); skip when <= 0.
mov eax, INT [max_v_samp(ebp)]
test eax,eax
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
; cld: make rep stosb advance edi upwards.
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16,7
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
; eax (row counter) and ecx (pad count) are saved because "mov al" and
; "rep stosb" below overwrite them.
push eax
push ecx
push eax
push ecx
; edi = one past the last real sample of this row; al = that last sample.
mov edi, JSAMPROW [esi]
add edi,edx
mov al, JSAMPLE [edi-1]
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
; Replicate the last sample ecx times to the right of the row.
rep stosb
rep stosb
pop ecx
pop eax
pop ecx
pop eax
; Next row pointer; loop while rows remain.
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
pop ecx ; output_cols
; -- h2v2_downsample
; -- h2v2_downsample
; eax = number of output rows (v_samp); return when <= 0.
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax,eax
jle near .return
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
; Loop-invariant constants:
;   xmm7 = alternating word bias {1,2,...} added before the >>2
;   xmm6 = 0x00FF in every word, used to isolate the even-indexed bytes
mov edx, 0x00020001 ; bias pattern
movd xmm7,edx
pcmpeqw xmm6,xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov edx, 0x00020001 ; bias pattern
movd xmm7, edx
pcmpeqw xmm6, xmm6
pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16,7
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
; Save the per-row state (output_cols and the two row-pointer cursors);
; the column loop below advances/consumes all three.
push ecx
push edi
push esi
push ecx
push edi
push esi
; edx/esi = the two consecutive input rows, edi = the output row.
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr
; A full iteration produces SIZEOF_XMMWORD output bytes.  With fewer than
; that remaining, fall through to the short-tail path.
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
alignx 16,7
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
alignx 16, 7
.columnloop_r8:
; Tail path: load only the first xmmword of each input row and use zeros
; for the second.  ecx is forced to SIZEOF_XMMWORD so the common loop
; epilogue below leaves it at 0 and the column loop terminates.
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm2,xmm2
pxor xmm3,xmm3
mov ecx, SIZEOF_XMMWORD
jmp short .downsample
alignx 16,7
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm2, xmm2
pxor xmm3, xmm3
mov ecx, SIZEOF_XMMWORD
jmp short .downsample
alignx 16, 7
.columnloop:
; Full path: two xmmwords from each of the two input rows.
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
; First xmmword of each row: horizontal pair sums as 16-bit words
; (even byte via the xmm6 mask + odd byte via the word shift).
movdqa xmm4,xmm0
movdqa xmm5,xmm1
pand xmm0,xmm6
psrlw xmm4,BYTE_BIT
pand xmm1,xmm6
psrlw xmm5,BYTE_BIT
paddw xmm0,xmm4
paddw xmm1,xmm5
movdqa xmm4, xmm0
movdqa xmm5, xmm1
pand xmm0, xmm6
psrlw xmm4, BYTE_BIT
pand xmm1, xmm6
psrlw xmm5, BYTE_BIT
paddw xmm0, xmm4
paddw xmm1, xmm5
; Same horizontal pair sums for the second xmmword of each row.
movdqa xmm4,xmm2
movdqa xmm5,xmm3
pand xmm2,xmm6
psrlw xmm4,BYTE_BIT
pand xmm3,xmm6
psrlw xmm5,BYTE_BIT
paddw xmm2,xmm4
paddw xmm3,xmm5
movdqa xmm4, xmm2
movdqa xmm5, xmm3
pand xmm2, xmm6
psrlw xmm4, BYTE_BIT
pand xmm3, xmm6
psrlw xmm5, BYTE_BIT
paddw xmm2, xmm4
paddw xmm3, xmm5
; Vertical sum of the two rows, add the {1,2,..} bias, divide by 4.
paddw xmm0,xmm1
paddw xmm2,xmm3
paddw xmm0,xmm7
paddw xmm2,xmm7
psrlw xmm0,2
psrlw xmm2,2
paddw xmm0, xmm1
paddw xmm2, xmm3
paddw xmm0, xmm7
paddw xmm2, xmm7
psrlw xmm0, 2
psrlw xmm2, 2
; Narrow the two word vectors back into one vector of output bytes.
packuswb xmm0,xmm2
packuswb xmm0, xmm2
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
; Advance: one output xmmword consumed two input xmmwords per row.
; >= SIZEOF_XMMWORD columns left -> full path; 0 < left -> tail path.
sub ecx, byte SIZEOF_XMMWORD ; outcol
add edx, byte 2*SIZEOF_XMMWORD ; inptr0
add esi, byte 2*SIZEOF_XMMWORD ; inptr1
add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx,ecx
jnz near .columnloop_r8
sub ecx, byte SIZEOF_XMMWORD ; outcol
add edx, byte 2*SIZEOF_XMMWORD ; inptr0
add esi, byte 2*SIZEOF_XMMWORD ; inptr1
add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx, ecx
jnz near .columnloop_r8
; Restore the row cursors and output_cols saved at .rowloop.
pop esi
pop edi
pop ecx
pop esi
pop edi
pop ecx
; Two input rows are consumed per output row.
add esi, byte 2*SIZEOF_JSAMPROW ; input_data
add edi, byte 1*SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
add esi, byte 2*SIZEOF_JSAMPROW ; input_data
add edi, byte 1*SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
.return:
; Epilogue: mirror of the pushes at entry.
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -34,407 +34,407 @@
; r13 = JSAMPARRAY output_buf
; r14 = int num_rows
;
; NOTE(review): in this listing every instruction appears twice in a row: the
; pre-reformat spelling followed by the post-reformat spelling of the same
; line (diff rendering of a whitespace-only commit).  Read each pair as ONE
; instruction.
;
; Register use as seen in the body below (after the collect_args macro, which
; is defined outside this chunk -- presumably it loads r10-r14 from the
; platform's argument registers; confirm against its definition):
;   r10d = num_cols (pixels per row)
;   r11  = base of an array of three JSAMPARRAYs (read as Y, Cb, Cr planes)
;   r12d = row index applied to each of the three planes
;   r13  = output row-pointer array, r14d = number of rows
;
; Each .columnloop iteration converts one xmmword of Y/Cb/Cr samples into
; RGB_PIXELSIZE xmmwords of interleaved output.  xmmA..xmmH used in the store
; sections are register aliases defined outside this chunk.
;
; wk(0)/wk(1) are two xmmword scratch slots just below the aligned rbp; they
; hold the (B-Y) terms while the registers are reused for the G computation.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_ycc_rgb_convert_sse2)
align 16
global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
; Prologue: build an xmmword-aligned frame.  The pre-alignment stack pointer
; (rax) is stored at the aligned frame base so the epilogue can recover it
; with "pop rsp".  rsp is then moved down to wk(0) to reserve the scratch.
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
; Nothing to do for zero columns.  (The 32-bit mov zero-extends into rcx.)
mov ecx, r10d ; num_cols
test rcx,rcx
jz near .return
mov ecx, r10d ; num_cols
test rcx, rcx
jz near .return
; rcx is borrowed as the row index while computing the three input row
; cursors, so num_cols is parked on the stack meanwhile.
push rcx
push rcx
; rsi/rbx/rdx = address of the row-pointer slot for the starting row in
; each of the three input planes.
mov rdi, r11
mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
mov rdi, r11
mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
pop rcx
pop rcx
; rdi = output row-pointer cursor, rax = row counter; return when <= 0.
mov rdi, r13
mov eax, r14d
test rax,rax
jle near .return
mov rdi, r13
mov eax, r14d
test rax, rax
jle near .return
.rowloop:
; Save all per-row state.  rax is saved too because the RGB_PIXELSIZE == 3
; tail code overwrites eax ("movd eax, xmmA").
push rax
push rdi
push rdx
push rbx
push rsi
push rcx ; col
push rax
push rdi
push rdx
push rbx
push rsi
push rcx ; col
; Dereference the row-pointer slots to get the actual sample pointers.
mov rsi, JSAMPROW [rsi] ; inptr0
mov rbx, JSAMPROW [rbx] ; inptr1
mov rdx, JSAMPROW [rdx] ; inptr2
mov rdi, JSAMPROW [rdi] ; outptr
mov rsi, JSAMPROW [rsi] ; inptr0
mov rbx, JSAMPROW [rbx] ; inptr1
mov rdx, JSAMPROW [rdx] ; inptr2
mov rdi, JSAMPROW [rdi] ; outptr
.columnloop:
; Inputs are read with movdqa, so the input rows are assumed to be
; xmmword-aligned.
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
; Build the two constants in registers: the 0x00FF even-byte mask and
; 0xFF80 (which is -128 as a signed word) used to centre Cb/Cr.
pcmpeqw xmm4,xmm4
pcmpeqw xmm7,xmm7
psrlw xmm4,BYTE_BIT
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pcmpeqw xmm4, xmm4
pcmpeqw xmm7, xmm7
psrlw xmm4, BYTE_BIT
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
; Widen bytes to words, split into even-indexed (E) and odd-indexed (O).
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
; Add 0xFF80 to every word, i.e. subtract 128 from each chroma sample.
paddw xmm4,xmm7
paddw xmm5,xmm7
paddw xmm0,xmm7
paddw xmm1,xmm7
paddw xmm4, xmm7
paddw xmm5, xmm7
paddw xmm0, xmm7
paddw xmm1, xmm7
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; Keep unscaled copies (xmm2/3 = Cb, xmm6/7 = Cr) and double the working
; copies before the high-half multiply.
movdqa xmm2,xmm4 ; xmm2=CbE
movdqa xmm3,xmm5 ; xmm3=CbO
paddw xmm4,xmm4 ; xmm4=2*CbE
paddw xmm5,xmm5 ; xmm5=2*CbO
movdqa xmm6,xmm0 ; xmm6=CrE
movdqa xmm7,xmm1 ; xmm7=CrO
paddw xmm0,xmm0 ; xmm0=2*CrE
paddw xmm1,xmm1 ; xmm1=2*CrO
movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1, xmm1 ; xmm1=2*CrO
; Fractional parts of the B and R coefficients (constant tables are
; defined outside this chunk and addressed RIP-relative).
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
; +1 then arithmetic >>1: undo the doubling with rounding.
paddw xmm4,[rel PW_ONE]
paddw xmm5,[rel PW_ONE]
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0,[rel PW_ONE]
paddw xmm1,[rel PW_ONE]
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4, [rel PW_ONE]
paddw xmm5, [rel PW_ONE]
psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0, [rel PW_ONE]
paddw xmm1, [rel PW_ONE]
psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
; Add the integer parts of the coefficients (Cb twice for B, Cr once for R).
paddw xmm4,xmm2
paddw xmm5,xmm3
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
paddw xmm4, xmm2
paddw xmm5, xmm3
paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
; Spill (B-Y) so xmm4/xmm5 can be reused for the G computation.
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
; G term: interleave Cb and Cr words so that one pmaddwd per pair yields
; Cb*coef0 + Cr*coef1 as a 32-bit sum.
movdqa xmm4,xmm2
movdqa xmm5,xmm3
punpcklwd xmm2,xmm6
punpckhwd xmm4,xmm6
pmaddwd xmm2,[rel PW_MF0344_F0285]
pmaddwd xmm4,[rel PW_MF0344_F0285]
punpcklwd xmm3,xmm7
punpckhwd xmm5,xmm7
pmaddwd xmm3,[rel PW_MF0344_F0285]
pmaddwd xmm5,[rel PW_MF0344_F0285]
movdqa xmm4, xmm2
movdqa xmm5, xmm3
punpcklwd xmm2, xmm6
punpckhwd xmm4, xmm6
pmaddwd xmm2, [rel PW_MF0344_F0285]
pmaddwd xmm4, [rel PW_MF0344_F0285]
punpcklwd xmm3, xmm7
punpckhwd xmm5, xmm7
pmaddwd xmm3, [rel PW_MF0344_F0285]
pmaddwd xmm5, [rel PW_MF0344_F0285]
; Round (add PD_ONEHALF) and scale the 32-bit sums back down by SCALEBITS.
paddd xmm2,[rel PD_ONEHALF]
paddd xmm4,[rel PD_ONEHALF]
psrad xmm2,SCALEBITS
psrad xmm4,SCALEBITS
paddd xmm3,[rel PD_ONEHALF]
paddd xmm5,[rel PD_ONEHALF]
psrad xmm3,SCALEBITS
psrad xmm5,SCALEBITS
paddd xmm2, [rel PD_ONEHALF]
paddd xmm4, [rel PD_ONEHALF]
psrad xmm2, SCALEBITS
psrad xmm4, SCALEBITS
paddd xmm3, [rel PD_ONEHALF]
paddd xmm5, [rel PD_ONEHALF]
psrad xmm3, SCALEBITS
psrad xmm5, SCALEBITS
; Narrow to words, then subtract Cr to turn +0.285*Cr into -0.714*Cr.
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
; Split Y into even/odd words the same way as the chroma.
pcmpeqw xmm4,xmm4
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
pcmpeqw xmm4, xmm4
psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
; Add Y to each colour difference; packuswb saturates to unsigned bytes.
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
; ---- Interleave and store.  Exactly one of the two %if branches below is
; assembled; both define the same local labels (.out0/.out1/.column_st*).
; In the lane diagrams, the first digit is the byte position within a pixel
; and the second digit is the pixel index (hex).
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA
movdqa xmmH,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
movdqa xmmG, xmmA
movdqa xmmH, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD
movdqa xmmB,xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
movdqa xmmC, xmmD
movdqa xmmB, xmmD
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
movdqa xmmF, xmmE
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB, xmmE
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB, xmmF
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
; xmmA, xmmD, xmmF now hold the three consecutive output xmmwords.
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; Fewer than a full xmmword of pixels left in this row -> partial store.
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
; Full group: non-temporal stores when outptr is xmmword-aligned,
; unaligned stores otherwise.
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
; Advance the output pointer and pixel count; row is done when rcx hits 0.
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
add rsi, byte SIZEOF_XMMWORD ; inptr0
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
add rsi, byte SIZEOF_XMMWORD ; inptr0
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
.column_st32:
; Partial group.  From here on rcx counts remaining output BYTES
; (pixels * 3).  With at least two xmmwords left, store xmmA and xmmD and
; continue with xmmF as the residual data in xmmA.
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
; At least one xmmword left: store xmmA, continue with xmmD as residual.
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of rax to the output when it has enough
; space.
movd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
; Store the lower 2 bytes of rax to the output when it has enough
; space.
movd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
.column_st1:
; Store the lower 1 byte of rax to the output when it has enough
; space.
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
; Store the lower 1 byte of rax to the output when it has enough
; space.
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
; Fourth (filler) channel: all ones or all zeros depending on the build flag.
%ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmC, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG, xmmB
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
; xmmA, xmmD, xmmC, xmmH end up as the four consecutive output xmmwords.
movdqa xmmD,xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH, xmmC
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; Fewer than a full xmmword of pixels left in this row -> partial store.
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
; Full group: non-temporal stores when outptr is xmmword-aligned,
; unaligned stores otherwise.
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
; Advance the output pointer and pixel count; row is done when rcx hits 0.
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
add rsi, byte SIZEOF_XMMWORD ; inptr0
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
add rsi, byte SIZEOF_XMMWORD ; inptr0
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
.column_st32:
; Partial group.  In this branch rcx keeps counting PIXELS (4 bytes each),
; so SIZEOF_XMMWORD/2 pixels correspond to two output xmmwords.
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmC
movdqa xmmD, xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
; SIZEOF_XMMWORD/4 pixels = one output xmmword.
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test rcx, rcx
jz short .nextrow
movd XMM_DWORD [rdi], xmmA
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test rcx, rcx
jz short .nextrow
movd XMM_DWORD [rdi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
.nextrow:
; Restore the per-row state pushed at .rowloop (reverse order).
pop rcx
pop rsi
pop rbx
pop rdx
pop rdi
pop rax
pop rcx
pop rsi
pop rbx
pop rdx
pop rdi
pop rax
; Step all four row-pointer cursors to the next row and loop while rows
; remain.
add rsi, byte SIZEOF_JSAMPROW
add rbx, byte SIZEOF_JSAMPROW
add rdx, byte SIZEOF_JSAMPROW
add rdi, byte SIZEOF_JSAMPROW ; output_buf
dec rax ; num_rows
jg near .rowloop
add rsi, byte SIZEOF_JSAMPROW
add rbx, byte SIZEOF_JSAMPROW
add rdx, byte SIZEOF_JSAMPROW
add rdi, byte SIZEOF_JSAMPROW ; output_buf
dec rax ; num_rows
jg near .rowloop
; Needed because the aligned path uses non-temporal (movntdq) stores.
sfence ; flush the write buffer
sfence ; flush the write buffer
.return:
; Epilogue: undo the prologue.  "pop rsp" reloads the stack pointer value
; that the prologue saved at the aligned frame base.
pop rbx
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
pop rbx
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -28,432 +28,432 @@
; JSAMPARRAY output_buf, int num_rows)
;
; Byte offsets of the stack-passed arguments from a base register b. The
; function body that follows uses b = eax, which it loads from esp right after
; `push ebp`, so (b)+0 is the saved ebp and the first argument is at (b)+8.
; NOTE(review): (b)+4 is presumably the return address -- confirm.
; NOTE(review): every %define in this listing appears twice with the same
; expansion (looks like the old and the reformatted spelling of one line).
%define out_width(b) (b)+8 ; JDIMENSION out_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define input_row(b) (b)+16 ; JDIMENSION input_row
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define num_rows(b) (b)+24 ; int num_rows
%define out_width(b) (b)+8 ; JDIMENSION out_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define input_row(b) (b)+16 ; JDIMENSION input_row
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define num_rows(b) (b)+24 ; int num_rows
; Frame layout relative to the aligned ebp established by the prologue:
;   original_ebp = slot at ebp+0; the prologue stores eax (the pre-alignment
;                  esp value) there
;   wk(i)        = i-th xmmword scratch slot; the WK_NUM slots end at ebp
;   gotptr       = one pointer-sized slot directly below wk(0)
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
; Entry and setup for the 32-bit YCbCr -> RGB converter.
; Arguments are read from the stack through the out_width()/input_buf()/
; input_row()/output_buf()/num_rows() offset macros with eax as the base.
; On reaching .rowloop: ecx = out_width, esi/ebx/edx = address of the
; input_row-th row pointer in planes 0/1/2 of input_buf, edi = output_buf,
; eax = num_rows. Returns early when out_width == 0 or num_rows <= 0.
; NOTE(review): every statement in this listing appears twice (old spelling,
; then reformatted spelling) -- looks like a rendered diff; confirm before
; assembling.
align 16
global EXTN(jsimd_ycc_rgb_convert_sse2)
align 16
global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
; Build a 16-byte-aligned frame: remember the incoming esp in eax, round esp
; down to an XMMWORD boundary, store eax at the new [esp] and make that ebp.
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
; Drop esp below the WK_NUM xmmword scratch slots.
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; ebx is only a temporary for the GOT address; it is parked in the gotptr
; frame slot and reloaded into eax inside .rowloop.
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
; Nothing to do for a zero-width image.
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx,ecx
jz near .return
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx
jz near .return
; ecx is borrowed for input_row while the plane pointers are indexed, so the
; column count is parked on the stack and popped back afterwards.
push ecx
push ecx
; esi/ebx/edx = &input_buf[0..2][input_row]
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [input_row(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [input_row(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
pop ecx
; num_rows is loaded last because it overwrites eax, the argument base.
; jle is a signed test: zero or negative row counts return immediately.
mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax,eax
jle near .return
alignx 16,7
mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
; Per-row setup. Saves the row-level state (eax = rows left, edi/edx/ebx/esi =
; row-pointer cursors, ecx = column count) so .nextrow can restore it, then
; replaces each cursor with the sample pointer it addresses and loads the
; saved GOT address into eax for the GOTOFF() constant references.
.rowloop:
push eax
push edi
push edx
push ebx
push esi
push ecx ; col
push eax
push edi
push edx
push ebx
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr0
mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16,7
mov esi, JSAMPROW [esi] ; inptr0
mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7
; Colour math for one XMMWORD (16 samples) of each plane.
; In:  esi = Y pointer, ebx = Cb pointer, edx = Cr pointer, eax = GOT base.
; Out: xmm0/xmm1 = R even/odd, xmm2/xmm3 = G even/odd, xmm4/xmm5 = B even/odd,
;      each packed to bytes in the low half of the register.
; The loads use movdqa, so the three input pointers must be 16-byte aligned.
.columnloop:
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
; Build the two word constants in registers: a low-byte mask (0x00FF per word)
; and 0xFF80, which is -128 as a signed word.
pcmpeqw xmm4,xmm4
pcmpeqw xmm7,xmm7
psrlw xmm4,BYTE_BIT
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pcmpeqw xmm4, xmm4
pcmpeqw xmm7, xmm7
psrlw xmm4, BYTE_BIT
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
; Widen bytes to words: the mask keeps even-indexed samples, the word shift
; brings odd-indexed samples down.
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
; Add -128 to every chroma word so Cb/Cr become signed values around zero.
paddw xmm4,xmm7
paddw xmm5,xmm7
paddw xmm0,xmm7
paddw xmm1,xmm7
paddw xmm4, xmm7
paddw xmm5, xmm7
paddw xmm0, xmm7
paddw xmm1, xmm7
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; Keep unscaled copies (xmm2/xmm3 = Cb, xmm6/xmm7 = Cr) for the "+ Cb + Cb",
; "+ Cr" and "- Cr" terms, and double the working copies before pmulhw.
movdqa xmm2,xmm4 ; xmm2=CbE
movdqa xmm3,xmm5 ; xmm3=CbO
paddw xmm4,xmm4 ; xmm4=2*CbE
paddw xmm5,xmm5 ; xmm5=2*CbO
movdqa xmm6,xmm0 ; xmm6=CrE
movdqa xmm7,xmm1 ; xmm7=CrO
paddw xmm0,xmm0 ; xmm0=2*CrE
paddw xmm1,xmm1 ; xmm1=2*CrO
movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1, xmm1 ; xmm1=2*CrO
; pmulhw keeps the high word of each signed 16x16 product.
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
; (x + 1) >> 1 undoes the doubling with rounding.
paddw xmm4,[GOTOFF(eax,PW_ONE)]
paddw xmm5,[GOTOFF(eax,PW_ONE)]
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0,[GOTOFF(eax,PW_ONE)]
paddw xmm1,[GOTOFF(eax,PW_ONE)]
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4, [GOTOFF(eax,PW_ONE)]
paddw xmm5, [GOTOFF(eax,PW_ONE)]
psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0, [GOTOFF(eax,PW_ONE)]
paddw xmm1, [GOTOFF(eax,PW_ONE)]
psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
; Add the unscaled chroma back in: twice Cb for B-Y, once Cr for R-Y.
paddw xmm4,xmm2
paddw xmm5,xmm3
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
paddw xmm4, xmm2
paddw xmm5, xmm3
paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
; Spill B-Y to the two scratch slots; xmm4/xmm5 are reused for the G term.
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
; G term: interleave Cb words with Cr words so each dword holds a (Cb, Cr)
; pair, then pmaddwd against the (-F_0_344, F_0_285) pairs of
; PW_MF0344_F0285 yields one 32-bit sum per sample.
movdqa xmm4,xmm2
movdqa xmm5,xmm3
punpcklwd xmm2,xmm6
punpckhwd xmm4,xmm6
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm3,xmm7
punpckhwd xmm5,xmm7
pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
movdqa xmm4, xmm2
movdqa xmm5, xmm3
punpcklwd xmm2, xmm6
punpckhwd xmm4, xmm6
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm3, xmm7
punpckhwd xmm5, xmm7
pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
; Round (add PD_ONEHALF) and drop the SCALEBITS fractional bits.
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
psrad xmm2,SCALEBITS
psrad xmm4,SCALEBITS
paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
psrad xmm3,SCALEBITS
psrad xmm5,SCALEBITS
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2, SCALEBITS
psrad xmm4, SCALEBITS
paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm3, SCALEBITS
psrad xmm5, SCALEBITS
; Narrow back to words and subtract Cr to turn +0.285*Cr into -0.714*Cr.
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
; Load 16 luma samples and split them into even/odd words like the chroma.
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
pcmpeqw xmm4,xmm4
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
pcmpeqw xmm4, xmm4
psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
; Add Y to each colour difference; packuswb saturates the signed words to
; unsigned bytes.
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
; B is formed last, in the Y registers themselves, from the spilled B-Y.
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
; 3-bytes-per-pixel output path: interleave the even/odd component registers
; into 48 consecutive output bytes (xmmA, xmmD, xmmF) and store them.
; In the register diagrams the first hex digit is the byte position within a
; pixel (0..2) and the second is the pixel index (0..F).
; NOTE(review): xmmA..xmmH are not defined in this listing; presumably they
; are aliases that map the R/G/B registers onto byte positions -- confirm.
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
; Stage 1: byte interleave.
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
; Stage 2: word interleave, using 2-byte-shifted copies to line up neighbours.
movdqa xmmG,xmmA
movdqa xmmH,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
movdqa xmmG, xmmA
movdqa xmmH, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD
movdqa xmmB,xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
movdqa xmmC, xmmD
movdqa xmmB, xmmD
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
movdqa xmmF, xmmE
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
; Stage 3: dword interleave (pshufd 0x4E swaps the two 64-bit halves).
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB, xmmE
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB, xmmF
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
; Stage 4: qword interleave gives the three output vectors in memory order.
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; ecx = pixels left in the row. Fewer than SIZEOF_XMMWORD pixels left means a
; partial store (.column_st32); jb is an unsigned compare.
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
; Full group: use non-temporal stores (movntdq) when the output pointer is
; 16-byte aligned, unaligned stores (movdqu) otherwise.
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
; Advance the output pointer and the pixel count; the row is finished when
; the count reaches exactly zero.
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
; More pixels remain: step the three input pointers and convert the next
; group.
add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16,7
add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
; Partial group: from here on ecx counts BYTES left (pixels * 3). The
; remainder is written in shrinking pieces (32, 16, 8, 4, 2, 1 bytes); after
; each piece the unwritten data is moved down into xmmA.
.column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of eax to the output when it has enough
; space.
; The movd overwrites eax, which held the GOT base; .rowloop reloads it from
; gotptr at the start of every row.
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
; Store the lower 2 bytes of eax to the output when it has enough
; space.
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
.column_st1:
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
; 4-bytes-per-pixel output path: a fourth (filler) component is synthesised in
; xmm6/xmm7, then the even/odd component registers are interleaved into 64
; consecutive output bytes (xmmA, xmmD, xmmC, xmmH) and stored.
; In the register diagrams the first hex digit is the byte position within a
; pixel (0..3) and the second is the pixel index (0..F).
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
; Filler bytes: all ones (0xFF) when RGBX_FILLER_0XFF is defined, zero
; otherwise.
%ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
; Stage 1: byte interleave (component 0 with 1, component 2 with 3).
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
; Stage 2: word interleave builds whole 4-byte pixels (even pixels in
; xmmA/xmmC, odd pixels in xmmB/xmmG).
movdqa xmmC,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmC, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG, xmmB
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
; Stage 3: dword interleave merges even and odd pixels into memory order.
movdqa xmmD,xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH, xmmC
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; ecx = pixels left in the row. Fewer than SIZEOF_XMMWORD pixels left means a
; partial store (.column_st32); jb is an unsigned compare.
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
; Full group: use non-temporal stores (movntdq) when the output pointer is
; 16-byte aligned, unaligned stores (movdqu) otherwise.
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
; Advance the output pointer and the pixel count; the row is finished when
; the count reaches exactly zero.
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
; More pixels remain: step the three input pointers and convert the next
; group.
add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16,7
add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
; Partial group: unlike the 3-byte path, ecx keeps counting PIXELS here, so
; the thresholds are SIZEOF_XMMWORD/2, /4 and /8 pixels for stores of 2
; vectors, 1 vector and 8 bytes, followed by a final single pixel.
.column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmC
movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
movd XMM_DWORD [edi], xmmA
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; End of one output row. Restores the row-level state in the reverse of the
; push order used at .rowloop (eax, edi, edx, ebx, esi, ecx), steps each of
; the four row-pointer cursors to the next row, and loops while rows remain.
alignx 16,7
alignx 16, 7
.nextrow:
pop ecx
pop esi
pop ebx
pop edx
pop edi
pop eax
pop ecx
pop esi
pop ebx
pop edx
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
add edi, byte SIZEOF_JSAMPROW ; output_buf
; dec sets the flags tested by jg: continue while the remaining count is
; still positive (signed).
dec eax ; num_rows
jg near .rowloop
add esi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
add edi, byte SIZEOF_JSAMPROW ; output_buf
dec eax ; num_rows
jg near .rowloop
; The aligned store path uses movntdq (non-temporal stores); sfence orders
; those stores before the function returns.
sfence ; flush the write buffer
sfence ; flush the write buffer
; Epilogue. Restores edi, esi and ebx (the reverse of the prologue pushes),
; then unwinds the aligned frame: esp is reset to the aligned ebp, `pop esp`
; reloads the pre-alignment stack pointer that the prologue stored at [ebp],
; and the final `pop ebp` restores the caller's ebp before returning.
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -20,21 +20,21 @@
; --------------------------------------------------------------------------
; Fixed-point YCbCr -> RGB coefficients with SCALEBITS fractional bits:
; FIX(x) is x scaled by 2^SCALEBITS (e.g. 22554 ~= 0.34414 * 65536).
%define SCALEBITS 16
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
; Reduced forms: each coefficient is rewritten as a difference from FIX(1) or
; FIX(2), which brings it below 32768 so it fits a signed 16-bit word
; (F_0_402 = 26345, F_0_285 = 18734, F_0_228 = 14942).
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
; Start of the 16-byte-aligned, exported constant table.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_ycc_rgb_convert_sse2)
alignz 16
global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
; PW_ONE: eight words of 1. PD_ONEHALF: four dwords of 1 << (SCALEBITS-1),
; i.e. one half at SCALEBITS fractional bits.
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Code section: the 64-bit routine body is pulled in from the included file.
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
%include "jdcolext-sse2-64.asm"

View File

@@ -20,21 +20,21 @@
; --------------------------------------------------------------------------
; Fixed-point YCbCr -> RGB coefficients with SCALEBITS fractional bits:
; FIX(x) is x scaled by 2^SCALEBITS (e.g. 22554 ~= 0.34414 * 65536).
%define SCALEBITS 16
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
; Reduced forms: each coefficient is rewritten as a difference from FIX(1) or
; FIX(2), which brings it below 32768 so it fits a signed 16-bit word
; (F_0_402 = 26345, F_0_285 = 18734, F_0_228 = 14942).
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
; Start of the 16-byte-aligned, exported constant table.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_ycc_rgb_convert_sse2)
alignz 16
global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
; PW_ONE: eight words of 1. PD_ONEHALF: four dwords of 1 << (SCALEBITS-1),
; i.e. one half at SCALEBITS fractional bits.
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Code section: the 32-bit routine body is pulled in from the included file.
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
%include "jdcolext-sse2.asm"

View File

@@ -17,11 +17,11 @@
;
; Address-arithmetic helpers. In every macro b is a base address and s an
; element size in bytes:
;   ROW(n,b,s)        = b + n*s
;   COL(n,b,s)        = b + n*s*DCTSIZE
;   DWBLOCK(m,n,b,s)  = b + m*DCTSIZE*s + n*SIZEOF_DWORD
;   MMBLOCK(m,n,b,s)  = same, with n stepping by SIZEOF_MMWORD
;   XMMBLOCK(m,n,b,s) = same, with n stepping by SIZEOF_XMMWORD
; NOTE(review): presumably m selects a group of DCTSIZE elements and n a
; vector-sized chunk within it -- confirm against the users of these macros.
%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
%define ROW(n,b,s) ((b)+(n)*(s))
%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
%define ROW(n,b,s) ((b)+(n)*(s))
%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
; --------------------------------------------------------------------------

View File

@@ -20,21 +20,21 @@
; --------------------------------------------------------------------------
; Fixed-point YCbCr -> RGB coefficients with SCALEBITS fractional bits:
; FIX(x) is x scaled by 2^SCALEBITS (e.g. 22554 ~= 0.34414 * 65536).
%define SCALEBITS 16
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
; Reduced forms: each coefficient is rewritten as a difference from FIX(1) or
; FIX(2), which brings it below 32768 so it fits a signed 16-bit word
; (F_0_402 = 26345, F_0_285 = 18734, F_0_228 = 14942).
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
; Start of the 16-byte-aligned, exported constant table.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_merged_upsample_sse2)
alignz 16
global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
; PW_ONE: eight words of 1. PD_ONEHALF: four dwords of 1 << (SCALEBITS-1),
; i.e. one half at SCALEBITS fractional bits.
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Code section: the 64-bit routine body is pulled in from the included file.
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
%include "jdmrgext-sse2-64.asm"

View File

@@ -20,21 +20,21 @@
; --------------------------------------------------------------------------
; Fixed-point YCbCr -> RGB coefficients with SCALEBITS fractional bits:
; FIX(x) is x scaled by 2^SCALEBITS (e.g. 22554 ~= 0.34414 * 65536).
%define SCALEBITS 16
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
; Reduced forms: each coefficient is rewritten as a difference from FIX(1) or
; FIX(2), which brings it below 32768 so it fits a signed 16-bit word
; (F_0_402 = 26345, F_0_285 = 18734, F_0_228 = 14942).
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
; Start of the 16-byte-aligned, exported constant table.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_merged_upsample_sse2)
alignz 16
global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
; PW_ONE: eight words of 1. PD_ONEHALF: four dwords of 1 << (SCALEBITS-1),
; i.e. one half at SCALEBITS fractional bits.
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Code section: the 32-bit routine body is pulled in from the included file.
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
%include "jdmrgext-sse2.asm"

View File

@@ -34,399 +34,399 @@
; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
; jsimd_h2v1_merged_upsample_sse2 (x86-64 build)
;
; Merged horizontal 2:1 chroma upsampling and YCbCr->RGB conversion for one
; output row.  Every .columnloop pass loads 16 Cb and 16 Cr samples, derives
; the (R-Y), (G-Y) and (B-Y) terms once, and then applies them to two
; consecutive 16-sample Y vectors, so each chroma sample serves two
; horizontally adjacent pixels.
;
; Arguments are fetched by collect_args (defined outside this view).  As
; used below: r10d = column count, r11 = input_buf, r12d = in_row_group_ctr,
; r13 = output_buf.
;
; Register roles inside the loops:
;   rsi = inptr0 (Y), rbx = inptr1 (Cb), rdx = inptr2 (Cr), rdi = outptr,
;   rcx = output pixels still to be written, al = Yctr.
;
; The input rows are read with movdqa, so they must be 16-byte aligned.  The
; output pointer may be unaligned; that case is detected and handled with
; movdqu.
;
; wk(0..WK_NUM-1) are xmmword scratch slots located directly below the
; aligned frame pointer.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3
align 16
global EXTN(jsimd_h2v1_merged_upsample_sse2)
align 16
global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2):
; Prologue: build a 16-byte-aligned frame.  The pre-alignment stack pointer
; is stored at [rbp] so that the epilogue can restore it with "pop rsp".
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
; Nothing to do for a zero-width row.
mov ecx, r10d ; col
test rcx,rcx
jz near .return
mov ecx, r10d ; col
test rcx, rcx
jz near .return
; col is parked on the stack while rcx temporarily holds the row index.
push rcx
push rcx
; Resolve the three input row pointers and the output row pointer.
mov rdi, r11
mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
mov rdi, JSAMPROW [rdi] ; outptr
mov rdi, r11
mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
mov rdi, JSAMPROW [rdi] ; outptr
pop rcx ; col
pop rcx ; col
; One pass per 16 chroma samples.
.columnloop:
movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
; 0xFF80 is -128 as a signed word; adding it below re-centres the
; zero-extended chroma samples around zero.
pxor xmm1,xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3,xmm3
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3, xmm3
psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
; Widen the bytes to words (low and high halves handled separately).
movdqa xmm4,xmm6
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0,xmm7
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
movdqa xmm4, xmm6
punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0, xmm7
punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
paddw xmm6,xmm3
paddw xmm4,xmm3
paddw xmm7,xmm3
paddw xmm0,xmm3
paddw xmm6, xmm3
paddw xmm4, xmm3
paddw xmm7, xmm3
paddw xmm0, xmm3
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; Keep unscaled copies (xmm5/xmm2/xmm1/xmm3) and double the working values:
; pmulhw keeps only the high word of the product, so the extra bit is what
; the "+1, >>1" step below uses for rounding.
movdqa xmm5,xmm6 ; xmm5=CbH
movdqa xmm2,xmm4 ; xmm2=CbL
paddw xmm6,xmm6 ; xmm6=2*CbH
paddw xmm4,xmm4 ; xmm4=2*CbL
movdqa xmm1,xmm7 ; xmm1=CrH
movdqa xmm3,xmm0 ; xmm3=CrL
paddw xmm7,xmm7 ; xmm7=2*CrH
paddw xmm0,xmm0 ; xmm0=2*CrL
movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
; Round to nearest: add 1, then arithmetic shift right by one.
paddw xmm6,[rel PW_ONE]
paddw xmm4,[rel PW_ONE]
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7,[rel PW_ONE]
paddw xmm0,[rel PW_ONE]
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6, [rel PW_ONE]
paddw xmm4, [rel PW_ONE]
psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7, [rel PW_ONE]
paddw xmm0, [rel PW_ONE]
psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
; Add the whole-number parts back: Cb twice for B, Cr once for R.
paddw xmm6,xmm5
paddw xmm4,xmm2
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
paddw xmm6, xmm5
paddw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
; The high-half results are spilled for the second Y pass; the low-half
; results stay in registers for the first pass.
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
; G-Y: interleave Cb and Cr words so that one pmaddwd against the
; (-F_0_344, F_0_285) pairs yields Cb*-0.344 + Cr*0.285 per dword.
movdqa xmm6,xmm5
movdqa xmm7,xmm2
punpcklwd xmm5,xmm1
punpckhwd xmm6,xmm1
pmaddwd xmm5,[rel PW_MF0344_F0285]
pmaddwd xmm6,[rel PW_MF0344_F0285]
punpcklwd xmm2,xmm3
punpckhwd xmm7,xmm3
pmaddwd xmm2,[rel PW_MF0344_F0285]
pmaddwd xmm7,[rel PW_MF0344_F0285]
movdqa xmm6, xmm5
movdqa xmm7, xmm2
punpcklwd xmm5, xmm1
punpckhwd xmm6, xmm1
pmaddwd xmm5, [rel PW_MF0344_F0285]
pmaddwd xmm6, [rel PW_MF0344_F0285]
punpcklwd xmm2, xmm3
punpckhwd xmm7, xmm3
pmaddwd xmm2, [rel PW_MF0344_F0285]
pmaddwd xmm7, [rel PW_MF0344_F0285]
; Round (add one half) and drop the SCALEBITS fraction.
paddd xmm5,[rel PD_ONEHALF]
paddd xmm6,[rel PD_ONEHALF]
psrad xmm5,SCALEBITS
psrad xmm6,SCALEBITS
paddd xmm2,[rel PD_ONEHALF]
paddd xmm7,[rel PD_ONEHALF]
psrad xmm2,SCALEBITS
psrad xmm7,SCALEBITS
paddd xmm5, [rel PD_ONEHALF]
paddd xmm6, [rel PD_ONEHALF]
psrad xmm5, SCALEBITS
psrad xmm6, SCALEBITS
paddd xmm2, [rel PD_ONEHALF]
paddd xmm7, [rel PD_ONEHALF]
psrad xmm2, SCALEBITS
psrad xmm7, SCALEBITS
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
; Two Y vectors are processed per chroma vector.  The first pass uses the
; low-half terms already in xmm0/xmm2/xmm4, so it skips the reload.
mov al,2 ; Yctr
jmp short .Yloop_1st
mov al, 2 ; Yctr
jmp short .Yloop_1st
; Second pass: reload the high-half terms saved in wk().
.Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
.Yloop_1st:
movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
; Split Y into even and odd samples as words; both members of a pair share
; the same chroma term.
pcmpeqw xmm6,xmm6
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
pcmpeqw xmm6, xmm6
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
; Add Y and saturate to unsigned bytes (packuswb clamps to 0..255).
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
; NOTE(review): xmmA..xmmH are register aliases defined outside this view;
; presumably they map the component registers above to the output byte
; order -- confirm against the including file.
; In the lane diagrams below, the first hex digit is the component index
; within a pixel and the second is the pixel index.
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
; Interleave the six planar vectors into 48 bytes of packed 3-byte pixels
; (final result in xmmA, xmmD, xmmF).
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA
movdqa xmmH,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
movdqa xmmG, xmmA
movdqa xmmH, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD
movdqa xmmB,xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
movdqa xmmC, xmmD
movdqa xmmB, xmmD
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
movdqa xmmF, xmmE
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB, xmmE
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB, xmmF
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
; Fewer than a full vector of pixels left: take the partial-store path.
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
; Full group: non-temporal stores when outptr is 16-byte aligned,
; unaligned stores otherwise.
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
; Advance Y; fetch new chroma only after both Y passes are done.
add rsi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add rsi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
; Partial tail: rcx is converted from pixels to bytes (x3), then the
; remainder is written in progressively smaller pieces, shifting the
; not-yet-stored data down into the low end of xmmA after each store.
.column_st32:
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of rax to the output when it has enough
; space.
; (eax is overwritten here; Yctr in al is no longer needed because every
; path from this point leads to .endcolumn.)
movd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
; Store the lower 2 bytes of rax to the output when it has enough
; space.
movd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
.column_st1:
; Store the lower 1 byte of rax to the output when it has enough
; space.
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
; Store the lower 1 byte of rax to the output when it has enough
; space.
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
; Fourth (filler) component: all ones or all zeros depending on the build.
%ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
; Interleave the eight planar vectors into 64 bytes of packed 4-byte pixels
; (final result in xmmA, xmmD, xmmC, xmmH).
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmC, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG, xmmB
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH, xmmC
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; Fewer than a full vector of pixels left: take the partial-store path.
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32
; Full group: non-temporal stores when outptr is 16-byte aligned,
; unaligned stores otherwise.
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
test rdi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
; Advance Y; fetch new chroma only after both Y passes are done.
add rsi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add rsi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
; Partial tail: rcx remains a pixel count in this branch, and the
; thresholds are fractions of SIZEOF_XMMWORD; each step stores the pixels
; that fit and moves the remaining data down into xmmA (and xmmD).
.column_st32:
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmC
movdqa xmmD, xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test rcx, rcx
jz short .endcolumn
movd XMM_DWORD [rdi], xmmA
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test rcx, rcx
jz short .endcolumn
movd XMM_DWORD [rdi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; The fence orders the non-temporal (movntdq) stores issued above.
.endcolumn:
sfence ; flush the write buffer
sfence ; flush the write buffer
; Epilogue: undo the prologue in reverse order.  "pop rsp" reloads the
; pre-alignment stack pointer that was saved at [rbp].
.return:
pop rbx
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
pop rbx
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; --------------------------------------------------------------------------
;
@@ -444,94 +444,94 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
; jsimd_h2v2_merged_upsample_sse2 (x86-64 build)
;
; Produces two output rows by calling jsimd_h2v1_merged_upsample_sse2 twice
; with the same Cb/Cr row arrays and two consecutive Y rows.
;
; For each call a temporary three-entry input_buf {Y, Cb, Cr} is built on the
; stack.  Its Y entry is the Y row array already advanced by
; in_row_group_ctr entries; the callee indexes it by in_row_group_ctr again,
; so the first call reads Y row 2*in_row_group_ctr and the second call (Y
; entry advanced by one more row pointer) reads the row after it.
; output_buf is advanced by one row pointer for the second call.
align 16
global EXTN(jsimd_h2v2_merged_upsample_sse2)
align 16
global EXTN(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2):
; NOTE(review): rax = rsp here is overwritten by "mov eax, r10d" below;
; presumably it is consumed by collect_args (defined outside this view) --
; confirm before removing.
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbx
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
push rbx
; eax = first argument register as left by collect_args; it is forwarded
; unchanged as the callee's first argument.
mov eax, r10d
mov eax, r10d
; rsi = Y row array advanced by in_row_group_ctr, rbx = Cb row array,
; rdx = Cr row array, rdi = output_buf, rcx = in_row_group_ctr.
mov rdi, r11
mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
mov rdi, r11
mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
; Build the temporary input_buf on the stack; after the three pushes rsp
; points at {rsi, rbx, rdx}, and rbx is set to that address.
push rdx ; inptr2
push rbx ; inptr1
push rsi ; inptr00
mov rbx,rsp
push rdx ; inptr2
push rbx ; inptr1
push rsi ; inptr00
mov rbx, rsp
; Preserve the values needed again for the second call.
push rdi
push rcx
push rax
push rdi
push rcx
push rax
; Marshal (width, input_buf, in_row_group_ctr, output_buf) into the
; argument registers of the target ABI.  The move order matters because
; source and destination register sets overlap.
%ifdef WIN64
mov r8, rcx
mov r9, rdi
mov rcx, rax
mov rdx, rbx
%else
mov rdx, rcx
mov rcx, rdi
mov rdi, rax
mov rsi, rbx
%endif
%ifdef WIN64
mov r8, rcx
mov r9, rdi
mov rcx, rax
mov rdx, rbx
%else
mov rdx, rcx
mov rcx, rdi
mov rdi, rax
mov rsi, rbx
%endif
call EXTN(jsimd_h2v1_merged_upsample_sse2)
call EXTN(jsimd_h2v1_merged_upsample_sse2)
; Restore the saved values and tear down the temporary input_buf.
pop rax
pop rcx
pop rdi
pop rsi
pop rbx
pop rdx
pop rax
pop rcx
pop rdi
pop rsi
pop rbx
pop rdx
; Second row: step output_buf and the Y row array by one row pointer each.
add rdi, byte SIZEOF_JSAMPROW ; outptr1
add rsi, byte SIZEOF_JSAMPROW ; inptr01
add rdi, byte SIZEOF_JSAMPROW ; outptr1
add rsi, byte SIZEOF_JSAMPROW ; inptr01
push rdx ; inptr2
push rbx ; inptr1
push rsi ; inptr01
mov rbx,rsp
push rdx ; inptr2
push rbx ; inptr1
push rsi ; inptr01
mov rbx, rsp
push rdi
push rcx
push rax
push rdi
push rcx
push rax
%ifdef WIN64
mov r8, rcx
mov r9, rdi
mov rcx, rax
mov rdx, rbx
%else
mov rdx, rcx
mov rcx, rdi
mov rdi, rax
mov rsi, rbx
%endif
%ifdef WIN64
mov r8, rcx
mov r9, rdi
mov rcx, rax
mov rdx, rbx
%else
mov rdx, rcx
mov rcx, rdi
mov rdi, rax
mov rsi, rbx
%endif
call EXTN(jsimd_h2v1_merged_upsample_sse2)
call EXTN(jsimd_h2v1_merged_upsample_sse2)
pop rax
pop rcx
pop rdi
pop rsi
pop rbx
pop rdx
pop rax
pop rcx
pop rdi
pop rsi
pop rbx
pop rdx
; Epilogue: mirror of the prologue.
pop rbx
uncollect_args
pop rbp
ret
pop rbx
uncollect_args
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -29,422 +29,422 @@
; JSAMPARRAY output_buf);
;
%define output_width(b) (b)+8 ; JDIMENSION output_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define output_width(b) (b)+8 ; JDIMENSION output_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16
global EXTN(jsimd_h2v1_merged_upsample_sse2)
align 16
global EXTN(jsimd_h2v1_merged_upsample_sse2)
; jsimd_h2v1_merged_upsample_sse2 -- entry, frame setup and row-pointer loads.
; NOTE: this listing shows each group of lines twice (the text before and
; after the whitespace reformat); both copies carry the same instructions.
;
; Frame layout: ebp is rounded down to a SIZEOF_XMMWORD boundary so the
; ebp-relative wk(i) slots can be accessed with aligned moves.  The slot at
; [ebp] keeps the pre-alignment stack pointer; the caller's arguments are
; reached through that pointer (output_width(eax), input_buf(eax), ...).
EXTN(jsimd_h2v1_merged_upsample_sse2):
push ebp
mov eax,esp ; eax = esp after the push (points at the saved ebp)
sub esp, byte 4 ; room for the saved-pointer slot
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax ; keep the unaligned pointer for the epilogue's "pop esp"
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)] ; reserve the wk[] scratch area below ebp
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = esp after the push (points at the saved ebp)
sub esp, byte 4 ; room for the saved-pointer slot
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax ; keep the unaligned pointer for the epilogue's "pop esp"
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] ; reserve the wk[] scratch area below ebp
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
; Early exit: a zero output width leaves nothing to convert.
mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx,ecx
jz near .return
mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx
jz near .return
; col is parked on the stack while ecx is borrowed as the row index.
push ecx
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
mov edi, JSAMPARRAY [output_buf(eax)]
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
mov edi, JSAMPARRAY [output_buf(eax)]
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
pop ecx ; col
pop ecx ; col
alignx 16,7
alignx 16, 7
; .columnloop -- chroma stage, run once per 16 Cb/Cr samples.
; Loads 16 Cb (from ebx) and 16 Cr (from edx) bytes, widens them to words,
; adds 0xFF80 to every word (i.e. subtracts 128) and derives the three
; colour differences (B-Y), (R-Y), (G-Y) for the low and high 8 samples.
; On exit from this section: xmm0=(R-Y)L, xmm2=(G-Y)L, xmm4=(B-Y)L and
; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H; al holds the Y-pass counter.
; (Each group of lines appears twice: before and after the reformat.)
.columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax)
movpic eax, POINTER [gotptr] ; load GOT address (eax)
; movdqa is the aligned-load form, so inptr1/inptr2 must be 16-byte aligned.
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
pxor xmm1,xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3,xmm3 ; xmm3=(all 1's)
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3, xmm3 ; xmm3=(all 1's)
psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
; Zero-extend the bytes to words by interleaving with the zero register.
movdqa xmm4,xmm6
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0,xmm7
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
movdqa xmm4, xmm6
punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0, xmm7
punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
; Adding 0xFF80 per word re-centres the samples (value - 128).
paddw xmm6,xmm3
paddw xmm4,xmm3
paddw xmm7,xmm3
paddw xmm0,xmm3
paddw xmm6, xmm3
paddw xmm4, xmm3
paddw xmm7, xmm3
paddw xmm0, xmm3
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
; Keep unscaled copies (xmm5/xmm2/xmm1/xmm3) and double the working values
; before the high-word multiply; the doubling is undone by the psraw below.
movdqa xmm5,xmm6 ; xmm5=CbH
movdqa xmm2,xmm4 ; xmm2=CbL
paddw xmm6,xmm6 ; xmm6=2*CbH
paddw xmm4,xmm4 ; xmm4=2*CbL
movdqa xmm1,xmm7 ; xmm1=CrH
movdqa xmm3,xmm0 ; xmm3=CrL
paddw xmm7,xmm7 ; xmm7=2*CrH
paddw xmm0,xmm0 ; xmm0=2*CrL
movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
; Add PW_ONE, then shift right by one: halves the doubled product.
paddw xmm6,[GOTOFF(eax,PW_ONE)]
paddw xmm4,[GOTOFF(eax,PW_ONE)]
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7,[GOTOFF(eax,PW_ONE)]
paddw xmm0,[GOTOFF(eax,PW_ONE)]
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6, [GOTOFF(eax,PW_ONE)]
paddw xmm4, [GOTOFF(eax,PW_ONE)]
psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7, [GOTOFF(eax,PW_ONE)]
paddw xmm0, [GOTOFF(eax,PW_ONE)]
psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
; Cb is added twice (the "+ Cb + Cb" of the B formula), Cr once.
paddw xmm6,xmm5 ; first "+ Cb" (high half)
paddw xmm4,xmm2 ; first "+ Cb" (low half)
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
paddw xmm6, xmm5 ; first "+ Cb" (high half)
paddw xmm4, xmm2 ; first "+ Cb" (low half)
paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
; Spill the high halves; xmm6/xmm7 are reused as temporaries below.
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
; G term: interleave Cb and Cr words so that one pmaddwd yields the
; two-term sum per sample as a 32-bit value (4 samples per register).
movdqa xmm6,xmm5
movdqa xmm7,xmm2
punpcklwd xmm5,xmm1
punpckhwd xmm6,xmm1
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm2,xmm3
punpckhwd xmm7,xmm3
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
movdqa xmm6, xmm5
movdqa xmm7, xmm2
punpcklwd xmm5, xmm1
punpckhwd xmm6, xmm1
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm2, xmm3
punpckhwd xmm7, xmm3
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
; Add PD_ONEHALF, then shift the 32-bit sums right by SCALEBITS.
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
paddd xmm6,[GOTOFF(eax,PD_ONEHALF)]
psrad xmm5,SCALEBITS
psrad xmm6,SCALEBITS
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
paddd xmm7,[GOTOFF(eax,PD_ONEHALF)]
psrad xmm2,SCALEBITS
psrad xmm7,SCALEBITS
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm5, SCALEBITS
psrad xmm6, SCALEBITS
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2, SCALEBITS
psrad xmm7, SCALEBITS
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
; Two Y passes per chroma block.  The first pass uses the low halves that
; are still in xmm0/xmm2/xmm4, so it enters at .Yloop_1st and skips the
; reload at .Yloop_2nd.  al overwrites the low byte of eax; the last
; GOTOFF(eax,...) reference in this section is above this point.
mov al,2 ; Yctr
jmp short .Yloop_1st
alignx 16,7
mov al, 2 ; Yctr
jmp short .Yloop_1st
alignx 16, 7
; .Yloop_2nd / .Yloop_1st -- luma stage, run for 16 Y samples at a time.
; .Yloop_2nd is the entry for the second pass: it reloads the high-half
; colour differences that were saved in wk(0..2).  .Yloop_1st is the common
; body: it expects (R-Y), (G-Y), (B-Y) in xmm0, xmm2, xmm4.
; Each colour-difference word is applied to two neighbouring Y samples: the
; even-indexed one (YE) and the odd-indexed one (YO).
; (Each group of lines appears twice: before and after the reformat.)
.Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
alignx 16,7
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
alignx 16, 7
.Yloop_1st:
; Aligned-load form: inptr0 (esi) must be 16-byte aligned.
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
; Split the 16 bytes into 8 even-position and 8 odd-position words.
pcmpeqw xmm6,xmm6 ; xmm6=(all 1's)
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
pcmpeqw xmm6, xmm6 ; xmm6=(all 1's)
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
; Duplicate each difference so it can be added to YE and to YO separately.
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
; packuswb narrows the word sums to bytes with unsigned saturation; only
; the low 8 bytes of each result are meaningful (the upper 8 repeat them).
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; ---------------
%if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA
movdqa xmmH,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
movdqa xmmG, xmmA
movdqa xmmH, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD
movdqa xmmB,xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
movdqa xmmC, xmmD
movdqa xmmB, xmmD
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
movdqa xmmF, xmmE
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB, xmmE
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB, xmmF
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16,7
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of eax to the output when it has enough
; space.
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
; Store the lower 2 bytes of eax to the output when it has enough
; space.
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov WORD [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
.column_st1:
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmC, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG, xmmB
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH, xmmC
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16,7
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmC
movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd XMM_DWORD [edi], xmmA
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
; .endcolumn / .return -- common exit of jsimd_h2v1_merged_upsample_sse2.
; .endcolumn is reached after pixels have been stored; the aligned output
; path stores with movntdq (non-temporal), hence the sfence.
; .return is also the direct target of the zero-width early exit, which
; skips the sfence.
; (Each group of lines appears twice: before and after the reformat.)
.endcolumn:
sfence ; flush the write buffer
sfence ; flush the write buffer
.return:
; Restore in reverse push order, then undo the aligned frame: [ebp] holds
; the stack pointer saved by the prologue before alignment.
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- pre-alignment stack pointer (points at the saved ebp)
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- pre-alignment stack pointer (points at the saved ebp)
pop ebp
ret
; --------------------------------------------------------------------------
;
@@ -457,62 +457,62 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; JSAMPARRAY output_buf);
;
%define output_width(b) (b)+8 ; JDIMENSION output_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define output_width(b) (b)+8 ; JDIMENSION output_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
align 16
global EXTN(jsimd_h2v2_merged_upsample_sse2)
align 16
global EXTN(jsimd_h2v2_merged_upsample_sse2)
; jsimd_h2v2_merged_upsample_sse2 -- produces two output rows by calling
; jsimd_h2v1_merged_upsample_sse2 twice.
;
; A temporary three-entry pointer table is built on the stack and passed as
; the callee's input_buf.  Entry 0 is the ADDRESS of
; input_buf[0][in_row_group_ctr] rather than the row array itself; since the
; callee indexes entry 0 by in_row_group_ctr again, the luma row it reads is
; input_buf[0][2*in_row_group_ctr].  Entries 1 and 2 are the unmodified
; input_buf[1] and input_buf[2] arrays.
; For the second call, entry 0 and the output_buf argument slot are each
; advanced by one row pointer in place, and the same argument block is reused.
; This relies on the callee preserving ebx, esi and edi and leaving its stack
; arguments unmodified.
; (Each group of lines appears twice: before and after the reformat.)
EXTN(jsimd_h2v2_merged_upsample_sse2):
push ebp
mov ebp,esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; NOTE(review): output_width is documented as JDIMENSION and is read with a
; JDIMENSION specifier in the h2v1 routine; the POINTER specifier here looks
; inconsistent -- confirm both expand to the same operand size.
mov eax, POINTER [output_width(ebp)]
mov eax, POINTER [output_width(ebp)]
mov edi, JSAMPIMAGE [input_buf(ebp)]
mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
mov edi, JSAMPARRAY [output_buf(ebp)]
lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][in_row_group_ctr]
mov edi, JSAMPIMAGE [input_buf(ebp)]
mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
mov edi, JSAMPARRAY [output_buf(ebp)]
lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][in_row_group_ctr]
; Build the temporary table (pushed last-to-first); ebx = its address.
push edx ; inptr2
push ebx ; inptr1
push esi ; inptr00
mov ebx,esp
push edx ; inptr2
push ebx ; inptr1
push esi ; inptr00
mov ebx, esp
; Arguments for the callee, pushed right to left.
push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr
push ebx ; input_buf
push eax ; output_width
push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr
push ebx ; input_buf
push eax ; output_width
call near EXTN(jsimd_h2v1_merged_upsample_sse2)
call near EXTN(jsimd_h2v1_merged_upsample_sse2)
; Patch the argument block in place for the second row:
; [ebx+0] is table entry 0, [ebx-SIZEOF_POINTER] is the output_buf slot
; (the first argument pushed after the table).
add esi, byte SIZEOF_JSAMPROW ; inptr01
add edi, byte SIZEOF_JSAMPROW ; outptr1
mov POINTER [ebx+0*SIZEOF_POINTER], esi
mov POINTER [ebx-1*SIZEOF_POINTER], edi
add esi, byte SIZEOF_JSAMPROW ; inptr01
add edi, byte SIZEOF_JSAMPROW ; outptr1
mov POINTER [ebx+0*SIZEOF_POINTER], esi
mov POINTER [ebx-1*SIZEOF_POINTER], edi
call near EXTN(jsimd_h2v1_merged_upsample_sse2)
call near EXTN(jsimd_h2v1_merged_upsample_sse2)
; Drop the 4 arguments and the 3 table entries.
add esp, byte 7*SIZEOF_DWORD
add esp, byte 7*SIZEOF_DWORD
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -25,32 +25,32 @@
; --------------------------------------------------------------------------
; unpcklps2 / unpckhps2 -- two-operand helpers built on shufps.
; shufps takes result elements 0-1 from the destination (%1) and result
; elements 2-3 from the source (%2); each element is chosen by one 2-bit
; field of the immediate (lowest field = result element 0).
; (Each macro header and body appears twice: before and after the reformat.)
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 ; 0x44 = 01 00 01 00b: elements 0,1 of %1 then 0,1 of %2
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44 ; 0x44 = 01 00 01 00b: elements 0,1 of %1 then 0,1 of %2
%endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE ; 0xEE = 11 10 11 10b: elements 2,3 of %1 then 2,3 of %2
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE ; 0xEE = 11 10 11 10b: elements 2,3 of %1 then 2,3 of %2
%endmacro
; --------------------------------------------------------------------------
; Constant table for the floating-point forward DCT.
; Each entry is one value repeated four times ("times 4 dd"), i.e. four
; identical dwords so that a single 128-bit operand applies the same factor
; to all four lanes.  The table is padded to a 16-byte boundary at both ends.
; (Each group of lines appears twice: before and after the reformat.)
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_fdct_float_sse)
alignz 16
global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
PD_0_382 times 4 dd 0.382683432365089771728460 ; numerically sin(pi/8)
PD_0_707 times 4 dd 0.707106781186547524400844 ; numerically 1/sqrt(2)
PD_0_541 times 4 dd 0.541196100146196984399723 ; numerically cos(pi/8)-sin(pi/8)
PD_1_306 times 4 dd 1.306562964876376527856643 ; numerically cos(pi/8)+sin(pi/8)
PD_0_382 times 4 dd 0.382683432365089771728460 ; numerically sin(pi/8)
PD_0_707 times 4 dd 0.707106781186547524400844 ; numerically 1/sqrt(2)
PD_0_541 times 4 dd 0.541196100146196984399723 ; numerically cos(pi/8)-sin(pi/8)
PD_1_306 times 4 dd 1.306562964876376527856643 ; numerically cos(pi/8)+sin(pi/8)
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Perform the forward DCT on one block of samples.
;
@@ -60,298 +60,298 @@ PD_1_306 times 4 dd 1.306562964876376527856643
; r10 = FAST_FLOAT *data
; Entry point and prologue of the 64-bit float forward DCT.
; wk(i) names slot i of WK_NUM xmmword scratch slots located immediately below
; the aligned rbp: wk(0) = rbp - 2*SIZEOF_XMMWORD, wk(1) = rbp - 1*SIZEOF_XMMWORD.
; NOTE(review): every statement below is listed twice (comma-tight spelling,
; then comma-space spelling); this looks like the old and new sides of a
; formatting diff shown together -- confirm before assembling this text.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_fdct_float_sse)
align 16
global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
; Prologue: save the caller's rbp, remember the unaligned rsp in rax, round rsp
; down to a SIZEOF_XMMWORD boundary, and store the remembered rsp at the new
; top of stack so the epilogue's "pop rsp" can undo the alignment.  rbp then
; anchors the aligned frame and rsp is lowered to wk(0) to reserve the scratch
; slots used by movaps.
; NOTE(review): collect_args is a macro defined outside this listing; the
; loops that follow read the data pointer from r10 -- confirm what it sets up.
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
; Pass 1 overview: rdx walks the data block and rcx counts DCTSIZE/4
; iterations.  Each iteration loads rows 0-3 of the current group as eight
; 4-float vectors, transposes them in two phases so that every register holds
; one column index (data0..data7) across the four rows, runs the even/odd
; butterfly network, and stores the eight result vectors back over the same
; memory (per the register-layout comments, still in transposed order).
; NOTE(review): every statement below is listed twice (comma-tight spelling,
; then comma-space spelling); this looks like the old and new sides of a
; formatting diff shown together -- confirm before assembling this text.
; NOTE(review): XMMBLOCK is a macro defined outside this listing; the element
; layouts quoted in the comments are taken from the existing annotations.
; ---- Pass 1: process rows.
; ---- Pass 1: process rows.
mov rdx, r10 ; (FAST_FLOAT *)
mov rcx, DCTSIZE/4
mov rdx, r10 ; (FAST_FLOAT *)
mov rcx, DCTSIZE/4
.rowloop:
; Load both halves of rows 2 and 3.
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
; Phase 1 for rows 2/3: interleave the two rows element by element.
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
; Load both halves of rows 0 and 1 (xmm1/xmm3 are free again here).
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
; Park two phase-1 results in the scratch slots: xmm4 and xmm2 are about to be
; reused as temporaries for rows 0/1.
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
; Phase 2: merge the row-0/1 pairs with the row-2/3 pairs to finish the
; outer columns data0, data1, data6 and data7.
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
; First butterfly on the outer pairs: sums give tmp0/tmp1, differences give
; tmp6/tmp7.
movaps xmm0,xmm7
movaps xmm5,xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
movaps xmm0, xmm7
movaps xmm5, xmm6
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
; Swap through the scratch slots: fetch the parked phase-1 data and park
; tmp6/tmp7 in its place until the odd part needs them.
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
; Phase 2 for the inner columns data2..data5.
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
; First butterfly on the inner pairs: sums give tmp2/tmp3, differences give
; tmp4/tmp5.
movaps xmm2,xmm7
movaps xmm3,xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
movaps xmm2, xmm7
movaps xmm3, xmm4
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
; -- Even part
; Inputs: xmm5=tmp0, xmm0=tmp1, xmm4=tmp2, xmm7=tmp3.  xmm2/xmm3 (tmp4/tmp5)
; stay untouched here and are consumed by the odd part.
movaps xmm1,xmm5
movaps xmm6,xmm0
subps xmm5,xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11
movaps xmm1, xmm5
movaps xmm6, xmm0
subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6, xmm4 ; xmm6=tmp11
; z1 = (tmp12 + tmp13) * PD_0_707
addps xmm0,xmm5
mulps xmm0,[rel PD_0_707] ; xmm0=z1
addps xmm0, xmm5
mulps xmm0, [rel PD_0_707] ; xmm0=z1
movaps xmm7,xmm1
movaps xmm4,xmm5
subps xmm1,xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2
movaps xmm7, xmm1
movaps xmm4, xmm5
subps xmm1, xmm6 ; xmm1=data4
subps xmm5, xmm0 ; xmm5=data6
addps xmm7, xmm6 ; xmm7=data0
addps xmm4, xmm0 ; xmm4=data2
; Store the even outputs (data4, data6, data0, data2) over the input tiles.
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part
; -- Odd part
; Bring back tmp6/tmp7 that were parked in the scratch slots.
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
; The order matters: each add consumes the previous register's old value
; (tmp10=tmp4+tmp5, tmp11=tmp5+tmp6, tmp12=tmp6+tmp7).
addps xmm2,xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[rel PD_0_707] ; xmm3=z3
mulps xmm3, [rel PD_0_707] ; xmm3=z3
; z5 = (tmp10 - tmp12) * PD_0_382 is shared by z2 and z4.
movaps xmm1,xmm2 ; xmm1=tmp10
subps xmm2,xmm6
mulps xmm2,[rel PD_0_382] ; xmm2=z5
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4
movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2, xmm6
mulps xmm2, [rel PD_0_382] ; xmm2=z5
mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1, xmm2 ; xmm1=z2
addps xmm6, xmm2 ; xmm6=z4
; z13 = tmp7 - z3, z11 = tmp7 + z3
movaps xmm5,xmm0
subps xmm0,xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11
movaps xmm5, xmm0
subps xmm0, xmm3 ; xmm0=z13
addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0
movaps xmm4,xmm5
subps xmm0,xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1
movaps xmm7, xmm0
movaps xmm4, xmm5
subps xmm0, xmm1 ; xmm0=data3
subps xmm5, xmm6 ; xmm5=data7
addps xmm7, xmm1 ; xmm7=data5
addps xmm4, xmm6 ; xmm4=data1
; Store the odd outputs (data3, data7, data5, data1) over the input tiles.
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
; Step rdx by 4*DCTSIZE floats (presumably four full rows -- confirm the row
; stride) and repeat until rcx reaches zero.
add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec rcx
jnz near .rowloop
add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec rcx
jnz near .rowloop
; Pass 2 overview: rdx restarts at the data pointer in r10 and rcx counts
; DCTSIZE/4 iterations.  Each iteration loads one 4-float vector from each of
; the eight XMMBLOCK rows 0-7, transposes them in two phases into data0..data7,
; runs the same even/odd butterfly network as pass 1, and stores the results
; to XMMBLOCK rows 0-7 at the same rdx.
; NOTE(review): every statement below is listed twice (comma-tight spelling,
; then comma-space spelling); this looks like the old and new sides of a
; formatting diff shown together -- confirm before assembling this text.
; NOTE(review): XMMBLOCK is a macro defined outside this listing; the element
; layouts quoted in the comments are taken from the existing annotations.
; ---- Pass 2: process columns.
; ---- Pass 2: process columns.
mov rdx, r10 ; (FAST_FLOAT *)
mov rcx, DCTSIZE/4
mov rdx, r10 ; (FAST_FLOAT *)
mov rcx, DCTSIZE/4
.columnloop:
; Load the current 4-float slice of block rows 2, 3, 6 and 7.
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
; Phase 1: interleave each pair of vectors element by element.
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
; Load the current 4-float slice of block rows 0, 1, 4 and 5 (xmm1/xmm3 are
; free again here).
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
; Park two phase-1 results in the scratch slots: xmm4 and xmm2 are about to be
; reused as temporaries.
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
; Phase 2: finish the outer vectors data0, data1, data6 and data7.
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
; First butterfly on the outer pairs: sums give tmp0/tmp1, differences give
; tmp6/tmp7.
movaps xmm0,xmm7
movaps xmm5,xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
movaps xmm0, xmm7
movaps xmm5, xmm6
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
; Swap through the scratch slots: fetch the parked phase-1 data and park
; tmp6/tmp7 in its place until the odd part needs them.
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
; Phase 2 for the inner vectors data2..data5.
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
; First butterfly on the inner pairs: sums give tmp2/tmp3, differences give
; tmp4/tmp5.
movaps xmm2,xmm7
movaps xmm3,xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
movaps xmm2, xmm7
movaps xmm3, xmm4
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
; -- Even part
; Inputs: xmm5=tmp0, xmm0=tmp1, xmm4=tmp2, xmm7=tmp3.  xmm2/xmm3 (tmp4/tmp5)
; stay untouched here and are consumed by the odd part.
movaps xmm1,xmm5
movaps xmm6,xmm0
subps xmm5,xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11
movaps xmm1, xmm5
movaps xmm6, xmm0
subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6, xmm4 ; xmm6=tmp11
; z1 = (tmp12 + tmp13) * PD_0_707
addps xmm0,xmm5
mulps xmm0,[rel PD_0_707] ; xmm0=z1
addps xmm0, xmm5
mulps xmm0, [rel PD_0_707] ; xmm0=z1
movaps xmm7,xmm1
movaps xmm4,xmm5
subps xmm1,xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2
movaps xmm7, xmm1
movaps xmm4, xmm5
subps xmm1, xmm6 ; xmm1=data4
subps xmm5, xmm0 ; xmm5=data6
addps xmm7, xmm6 ; xmm7=data0
addps xmm4, xmm0 ; xmm4=data2
; Store the even outputs: dataN goes to XMMBLOCK row N.
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part
; -- Odd part
; Bring back tmp6/tmp7 that were parked in the scratch slots.
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
; The order matters: each add consumes the previous register's old value
; (tmp10=tmp4+tmp5, tmp11=tmp5+tmp6, tmp12=tmp6+tmp7).
addps xmm2,xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[rel PD_0_707] ; xmm3=z3
mulps xmm3, [rel PD_0_707] ; xmm3=z3
; z5 = (tmp10 - tmp12) * PD_0_382 is shared by z2 and z4.
movaps xmm1,xmm2 ; xmm1=tmp10
subps xmm2,xmm6
mulps xmm2,[rel PD_0_382] ; xmm2=z5
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4
movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2, xmm6
mulps xmm2, [rel PD_0_382] ; xmm2=z5
mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1, xmm2 ; xmm1=z2
addps xmm6, xmm2 ; xmm6=z4
; z13 = tmp7 - z3, z11 = tmp7 + z3
movaps xmm5,xmm0
subps xmm0,xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11
movaps xmm5, xmm0
subps xmm0, xmm3 ; xmm0=z13
addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0
movaps xmm4,xmm5
subps xmm0,xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1
movaps xmm7, xmm0
movaps xmm4, xmm5
subps xmm0, xmm1 ; xmm0=data3
subps xmm5, xmm6 ; xmm5=data7
addps xmm7, xmm1 ; xmm7=data5
addps xmm4, xmm6 ; xmm4=data1
; Store the odd outputs: dataN goes to XMMBLOCK row N.
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
; Step rdx by four floats (the next 4-wide slice) and repeat until rcx
; reaches zero.
add rdx, byte 4*SIZEOF_FAST_FLOAT
dec rcx
jnz near .columnloop
add rdx, byte 4*SIZEOF_FAST_FLOAT
dec rcx
jnz near .columnloop
; Epilogue: point rsp back at the aligned frame base held in rbp, pop the
; pre-alignment stack pointer that the prologue stored there, then restore the
; caller's rbp and return.
; NOTE(review): uncollect_args is a macro defined outside this listing;
; presumably it undoes collect_args -- confirm against its definition.
; NOTE(review): the sequence is listed twice (comma-tight spelling, then
; comma-space spelling); this looks like the old and new sides of a formatting
; diff shown together -- confirm before assembling this text.
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -24,32 +24,32 @@
; --------------------------------------------------------------------------
; unpcklps2 dst, src -- keep the two low floats of dst and append the two low
; floats of src.  The shufps immediate 0x44 is 01_00_01_00b: result lanes 0-1
; are taken from %1 lanes 0-1 and result lanes 2-3 from %2 lanes 0-1.
; NOTE(review): the %macro/shufps pair is listed twice (comma-tight spelling,
; then comma-space spelling) ahead of a single %endmacro.  This looks like the
; old and new sides of a formatting diff shown together -- confirm before
; assembling this text.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44
%endmacro
; unpckhps2 dst, src -- keep the two high floats of dst and append the two
; high floats of src.  The shufps immediate 0xEE is 11_10_11_10b: result lanes
; 0-1 are taken from %1 lanes 2-3 and result lanes 2-3 from %2 lanes 2-3.
; NOTE(review): the %macro/shufps pair is listed twice (comma-tight spelling,
; then comma-space spelling) ahead of a single %endmacro.  This looks like the
; old and new sides of a formatting diff shown together -- confirm before
; assembling this text.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
; Constant table for the float forward DCT.  Each entry is one single-precision
; value repeated four times ("times 4 dd"), i.e. one 16-byte vector that can be
; used directly as a packed mulps operand.
; NOTE(review): every statement in this table is listed twice (old and new
; spacing of the same line); this looks like both sides of a formatting diff
; shown together -- confirm before assembling this text.
; NOTE(review): alignz is a macro defined outside this listing; presumably it
; pads to the given alignment -- confirm against its definition.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_fdct_float_sse)
alignz 16
global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
; Numerically: 0.3827 = cos(3*pi/8), 0.7071 = sqrt(2)/2,
; 0.5412 = cos(pi/8) - cos(3*pi/8), 1.3066 = cos(pi/8) + cos(3*pi/8).
PD_0_382 times 4 dd 0.382683432365089771728460
PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643
PD_0_382 times 4 dd 0.382683432365089771728460
PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
@@ -57,313 +57,313 @@ PD_1_306 times 4 dd 1.306562964876376527856643
; jsimd_fdct_float_sse (FAST_FLOAT *data)
;
%define data(b) (b)+8 ; FAST_FLOAT *data
%define data(b) (b)+8 ; FAST_FLOAT *data
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_fdct_float_sse)
align 16
global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4
alignx 16,7
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4
alignx 16, 7
.rowloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm0,xmm7
movaps xmm5,xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
movaps xmm0, xmm7
movaps xmm5, xmm6
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm2,xmm7
movaps xmm3,xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
movaps xmm2, xmm7
movaps xmm3, xmm4
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
; -- Even part
movaps xmm1,xmm5
movaps xmm6,xmm0
subps xmm5,xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11
movaps xmm1, xmm5
movaps xmm6, xmm0
subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
addps xmm0, xmm5
mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7,xmm1
movaps xmm4,xmm5
subps xmm1,xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2
movaps xmm7, xmm1
movaps xmm4, xmm5
subps xmm1, xmm6 ; xmm1=data4
subps xmm5, xmm0 ; xmm5=data6
addps xmm7, xmm6 ; xmm7=data0
addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part
; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10
subps xmm2,xmm6
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4
movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2, xmm6
mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1, xmm2 ; xmm1=z2
addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0
subps xmm0,xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11
movaps xmm5, xmm0
subps xmm0, xmm3 ; xmm0=z13
addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0
movaps xmm4,xmm5
subps xmm0,xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1
movaps xmm7, xmm0
movaps xmm4, xmm5
subps xmm0, xmm1 ; xmm0=data3
subps xmm5, xmm6 ; xmm5=data7
addps xmm7, xmm1 ; xmm7=data5
addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx
jnz near .rowloop
add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx
jnz near .rowloop
; ---- Pass 2: process columns.
; ---- Pass 2: process columns.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4
alignx 16,7
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4
alignx 16, 7
.columnloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm0,xmm7
movaps xmm5,xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
movaps xmm0, xmm7
movaps xmm5, xmm6
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm2,xmm7
movaps xmm3,xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
movaps xmm2, xmm7
movaps xmm3, xmm4
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
; -- Even part
movaps xmm1,xmm5
movaps xmm6,xmm0
subps xmm5,xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11
movaps xmm1, xmm5
movaps xmm6, xmm0
subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
addps xmm0, xmm5
mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7,xmm1
movaps xmm4,xmm5
subps xmm1,xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2
movaps xmm7, xmm1
movaps xmm4, xmm5
subps xmm1, xmm6 ; xmm1=data4
subps xmm5, xmm0 ; xmm5=data6
addps xmm7, xmm6 ; xmm7=data0
addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part
; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10
subps xmm2,xmm6
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4
movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2, xmm6
mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1, xmm2 ; xmm1=z2
addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0
subps xmm0,xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11
movaps xmm5, xmm0
subps xmm0, xmm3 ; xmm0=z13
addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0
movaps xmm4,xmm5
subps xmm0,xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1
movaps xmm7, xmm0
movaps xmm4, xmm5
subps xmm0, xmm1 ; xmm0=data3
subps xmm5, xmm6 ; xmm5=data7
addps xmm7, xmm1 ; xmm7=data5
addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
add edx, byte 4*SIZEOF_FAST_FLOAT
dec ecx
jnz near .columnloop
add edx, byte 4*SIZEOF_FAST_FLOAT
dec ecx
jnz near .columnloop
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -26,46 +26,46 @@
; --------------------------------------------------------------------------
; Fixed-point multipliers for the DCT: each F_x is the constant named in its
; FIX(...) comment, scaled by 2^CONST_BITS and rounded to an integer.
; NOTE(review): every statement in this listing appears twice (old and
; reformatted spelling) -- looks like a rendered diff; confirm before assembling.
%define CONST_BITS 8 ; 14 is also OK.
%define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8
; Values precomputed for CONST_BITS == 8 (constant * 256, rounded).
F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965)
F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The integer literals below are the same constants pre-scaled by 2^30;
; DESCALE(x,n) adds 2^(n-1) and shifts right by n, i.e. a rounding shift
; that brings them down to CONST_BITS fractional bits.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif
; --------------------------------------------------------------------------
; Multiplier table used by the ifast forward DCT (operands of pmulhw).
SECTION SEG_CONST
SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
; pmulhw keeps the high 16 bits of each signed 16x16 product, so when the
; data is pre-shifted by PRE_MULTIPLY_SCALE_BITS and the multiplier by
; CONST_SHIFT, the three scale factors cancel against that implicit >>16.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16
global EXTN(jconst_fdct_ifast_sse2)
alignz 16
global EXTN(jconst_fdct_ifast_sse2)
; Each PW_* row is one multiplier (F_x << CONST_SHIFT) replicated into
; 8 words, i.e. one copy per 16-bit lane of an xmmword.
EXTN(jconst_fdct_ifast_sse2):
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Perform the forward DCT on one block of samples.
;
@@ -75,317 +75,317 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
; r10 = DCTELEM *data
; Scratch area: wk(i) addresses the i-th of WK_NUM xmmword slots located
; directly below the aligned frame pointer rbp.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_fdct_ifast_sse2)
align 16
global EXTN(jsimd_fdct_ifast_sse2)
; jsimd_fdct_ifast_sse2 -- forward DCT on one block of DCTELEMs, in place.
; The block pointer arrives in r10 and is copied to rdx. Pass 1 loads the
; eight rows, transposes them and runs the 1-D transform, leaving its
; results in xmm registers and wk(0)/wk(1). Pass 2 transposes again, runs
; the 1-D transform a second time and stores all eight rows back via rdx.
; NOTE(review): every statement in this listing appears twice (old and
; reformatted spelling) -- looks like a rendered diff; confirm before assembling.
EXTN(jsimd_fdct_ifast_sse2):
; Prologue: save rbp, round rsp down to an xmmword boundary, stash the
; pre-alignment rsp at [rbp] (restored later by "pop rsp"), then reserve
; the wk[] slots by pointing rsp at wk(0).
; collect_args / uncollect_args are macros defined outside this view;
; presumably they set up / restore the argument registers -- TODO confirm.
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
; ---- Pass 1: process rows.
; ---- Pass 1: process rows.
mov rdx, r10 ; (DCTELEM *)
mov rdx, r10 ; (DCTELEM *)
; Load rows 0-3 and start the word-level transpose; rows 4-7 follow below.
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
; All eight xmm registers are live here, so two partially transposed rows
; are parked in wk(0)/wk(1) to free xmm2 and xmm5.
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
; Swap: reload the two parked rows and park two others in the same slots.
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
; First butterfly stage on the outer pairs (0,7) and (1,6).
movdqa xmm6,xmm1
movdqa xmm3,xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm6, xmm1
movdqa xmm3, xmm0
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
; Swap again: fetch the parked rows, park tmp6/tmp7 until the odd part.
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
; First butterfly stage on the inner pairs (3,4) and (2,5).
movdqa xmm2,xmm1
movdqa xmm5,xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
movdqa xmm2, xmm1
movdqa xmm5, xmm7
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part
; -- Even part
movdqa xmm4,xmm3
movdqa xmm0,xmm6
psubw xmm3,xmm1 ; xmm3=tmp13
psubw xmm6,xmm7 ; xmm6=tmp12
paddw xmm4,xmm1 ; xmm4=tmp10
paddw xmm0,xmm7 ; xmm0=tmp11
movdqa xmm4, xmm3
movdqa xmm0, xmm6
psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0, xmm7 ; xmm0=tmp11
; z1 = (tmp12+tmp13) * 0.707...: the sum is pre-shifted left by
; PRE_MULTIPLY_SCALE_BITS so that pmulhw (high word of the product) against
; the PW_F0707 words returns the product at the original scale.
paddw xmm6,xmm3
psllw xmm6,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6,[rel PW_F0707] ; xmm6=z1
paddw xmm6, xmm3
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
movdqa xmm1,xmm4
movdqa xmm7,xmm3
psubw xmm4,xmm0 ; xmm4=data4
psubw xmm3,xmm6 ; xmm3=data6
paddw xmm1,xmm0 ; xmm1=data0
paddw xmm7,xmm6 ; xmm7=data2
movdqa xmm1, xmm4
movdqa xmm7, xmm3
psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7, xmm6 ; xmm7=data2
; Retrieve tmp6/tmp7 for the odd part; data4/data6 take their slots and
; stay parked until pass 2 reloads them.
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
; -- Odd part
; -- Odd part
paddw xmm2,xmm5 ; xmm2=tmp10
paddw xmm5,xmm0 ; xmm5=tmp11
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
; Pre-scale every value that is about to be a pmulhw operand.
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F0707] ; xmm5=z3
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
movdqa xmm4,xmm2 ; xmm4=tmp10
psubw xmm2,xmm0
pmulhw xmm2,[rel PW_F0382] ; xmm2=z5
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm2 ; xmm4=z2
paddw xmm0,xmm2 ; xmm0=z4
movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2, xmm0
pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3,xmm6
psubw xmm6,xmm5 ; xmm6=z13
paddw xmm3,xmm5 ; xmm3=z11
movdqa xmm3, xmm6
psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3, xmm5 ; xmm3=z11
movdqa xmm2,xmm6
movdqa xmm5,xmm3
psubw xmm6,xmm4 ; xmm6=data3
psubw xmm3,xmm0 ; xmm3=data7
paddw xmm2,xmm4 ; xmm2=data5
paddw xmm5,xmm0 ; xmm5=data1
movdqa xmm2, xmm6
movdqa xmm5, xmm3
psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns.
; ---- Pass 2: process columns.
; Nothing was stored to the block in pass 1: its outputs are still in xmm
; registers (data4/data6 in wk(0)/wk(1)) and are transposed again here.
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm5,xmm6
movdqa xmm3,xmm1
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm5, xmm6
movdqa xmm3, xmm1
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm7,xmm6
movdqa xmm0,xmm2
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
movdqa xmm7, xmm6
movdqa xmm0, xmm2
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part
; -- Even part
movdqa xmm4,xmm3
movdqa xmm1,xmm5
psubw xmm3,xmm6 ; xmm3=tmp13
psubw xmm5,xmm2 ; xmm5=tmp12
paddw xmm4,xmm6 ; xmm4=tmp10
paddw xmm1,xmm2 ; xmm1=tmp11
movdqa xmm4, xmm3
movdqa xmm1, xmm5
psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1, xmm2 ; xmm1=tmp11
paddw xmm5,xmm3
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F0707] ; xmm5=z1
paddw xmm5, xmm3
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
movdqa xmm6,xmm4
movdqa xmm2,xmm3
psubw xmm4,xmm1 ; xmm4=data4
psubw xmm3,xmm5 ; xmm3=data6
paddw xmm6,xmm1 ; xmm6=data0
paddw xmm2,xmm5 ; xmm2=data2
movdqa xmm6, xmm4
movdqa xmm2, xmm3
psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2, xmm5 ; xmm2=data2
; Store the even-indexed output rows back into the caller's block.
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
; -- Odd part
; -- Odd part
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7,xmm0 ; xmm7=tmp10
paddw xmm0,xmm1 ; xmm0=tmp11
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
psllw xmm7,PRE_MULTIPLY_SCALE_BITS
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0,[rel PW_F0707] ; xmm0=z3
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
movdqa xmm4,xmm7 ; xmm4=tmp10
psubw xmm7,xmm1
pmulhw xmm7,[rel PW_F0382] ; xmm7=z5
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm7 ; xmm4=z2
paddw xmm1,xmm7 ; xmm1=z4
movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7, xmm1
pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3,xmm5
psubw xmm5,xmm0 ; xmm5=z13
paddw xmm3,xmm0 ; xmm3=z11
movdqa xmm3, xmm5
psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6,xmm5
movdqa xmm2,xmm3
psubw xmm5,xmm4 ; xmm5=data3
psubw xmm3,xmm1 ; xmm3=data7
paddw xmm6,xmm4 ; xmm6=data5
paddw xmm2,xmm1 ; xmm2=data1
movdqa xmm6, xmm5
movdqa xmm2, xmm3
psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2, xmm1 ; xmm2=data1
; Store the odd-indexed output rows back into the caller's block.
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
; Epilogue: drop the wk[] area, reload the pre-alignment rsp that the
; prologue stored at [rbp], then restore the caller's rbp.
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -25,46 +25,46 @@
; --------------------------------------------------------------------------
; Fixed-point multipliers for the DCT: each F_x is the constant named in its
; FIX(...) comment, scaled by 2^CONST_BITS and rounded to an integer.
; NOTE(review): every statement in this listing appears twice (old and
; reformatted spelling) -- looks like a rendered diff; confirm before assembling.
%define CONST_BITS 8 ; 14 is also OK.
%define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8
; Values precomputed for CONST_BITS == 8 (constant * 256, rounded).
F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965)
F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The integer literals below are the same constants pre-scaled by 2^30;
; DESCALE(x,n) adds 2^(n-1) and shifts right by n, i.e. a rounding shift
; that brings them down to CONST_BITS fractional bits.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif
; --------------------------------------------------------------------------
; Multiplier table used by the ifast forward DCT (operands of pmulhw).
SECTION SEG_CONST
SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
; pmulhw keeps the high 16 bits of each signed 16x16 product, so when the
; data is pre-shifted by PRE_MULTIPLY_SCALE_BITS and the multiplier by
; CONST_SHIFT, the three scale factors cancel against that implicit >>16.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16
global EXTN(jconst_fdct_ifast_sse2)
alignz 16
global EXTN(jconst_fdct_ifast_sse2)
; Each PW_* row is one multiplier (F_x << CONST_SHIFT) replicated into
; 8 words, i.e. one copy per 16-bit lane of an xmmword.
EXTN(jconst_fdct_ifast_sse2):
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
@@ -72,332 +72,332 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
; jsimd_fdct_ifast_sse2 (DCTELEM *data)
;
; Stack-frame helpers for jsimd_fdct_ifast_sse2 (32-bit code).
; data(b): address of the single argument relative to a base register b that
;          points at the saved ebp: [b+0]=saved ebp, [b+4]=return address,
;          [b+8]=DCTELEM *data.
%define data(b) (b)+8 ; DCTELEM *data
%define data(b) (b)+8 ; DCTELEM *data
; original_ebp: slot at the aligned frame base that holds the pre-alignment
;               stack pointer stored by the prologue.
; wk(i):        i-th 16-byte scratch slot; the WK_NUM slots sit directly
;               below the aligned ebp (wk(0) is the lowest address).
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_fdct_ifast_sse2)
align 16
global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2):
; Prologue: build a 16-byte-aligned frame so the wk() slots can be accessed
; with movdqa.  Note that eax receives the stack pointer taken right after
; "push ebp", i.e. the ADDRESS of the saved ebp (the inline comment
; "original ebp" is shorthand for that); eax is used later as the base for
; data(eax).  "sub esp, 4" + "and esp, -16" reserve an aligned dword in which
; that pointer is saved, ebp becomes the aligned frame base, and
; "lea esp, [wk(0)]" allocates the WK_NUM scratch slots below it.
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
; NOTE(review): pushpic/get_GOT are macros defined outside this chunk;
; presumably they preserve ebx and load the GOT base for PIC builds so that
; GOTOFF(ebx,...) can address the constant table -- confirm in the shared
; include file.
get_GOT ebx ; get GOT address
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
; ---- Pass 1: process rows.
; Load the 8x8 block of 16-bit elements (one row per XMM register),
; transpose it with three interleave phases (word, dword, qword), then form
; the first butterfly stage tmp0..tmp7.  Only eight XMM registers exist in
; 32-bit mode, so wk(0)/wk(1) are used as spill slots along the way.
; Register comments use "rc" notation: row digit followed by column digit.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov edx, POINTER [data(eax)] ; (DCTELEM *)
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
; Spill the phase-1 results of rows 2/3 to free xmm2/xmm5 for rows 4-7.
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
; Swap: reload the spilled rows-2/3 data and park two rows-4..7 vectors.
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
; First butterfly stage on the outer pairs (0,7) and (1,6).
movdqa xmm6,xmm1
movdqa xmm3,xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm6, xmm1
movdqa xmm3, xmm0
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
; tmp6/tmp7 are only needed by the odd part, so they take over the wk slots
; while the parked vectors are reloaded to finish the transpose.
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
; First butterfly stage on the inner pairs (3,4) and (2,5).
movdqa xmm2,xmm1
movdqa xmm5,xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
movdqa xmm2, xmm1
movdqa xmm5, xmm7
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part
; -- Even part
; Inputs: xmm3=tmp0, xmm6=tmp1, xmm7=tmp2, xmm1=tmp3.
; Produces data0/data2/data4/data6 of pass 1.  The only multiplication is
; z1 = (tmp12 + tmp13) * 0.707..., done as "psllw by PRE_MULTIPLY_SCALE_BITS"
; followed by pmulhw against the pre-shifted constant PW_F0707 (pmulhw keeps
; the high 16 bits of each signed 16x16 product).
movdqa xmm4,xmm3
movdqa xmm0,xmm6
psubw xmm3,xmm1 ; xmm3=tmp13
psubw xmm6,xmm7 ; xmm6=tmp12
paddw xmm4,xmm1 ; xmm4=tmp10
paddw xmm0,xmm7 ; xmm0=tmp11
movdqa xmm4, xmm3
movdqa xmm0, xmm6
psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0, xmm7 ; xmm0=tmp11
paddw xmm6,xmm3
psllw xmm6,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
paddw xmm6, xmm3
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
movdqa xmm1,xmm4
movdqa xmm7,xmm3
psubw xmm4,xmm0 ; xmm4=data4
psubw xmm3,xmm6 ; xmm3=data6
paddw xmm1,xmm0 ; xmm1=data0
paddw xmm7,xmm6 ; xmm7=data2
movdqa xmm1, xmm4
movdqa xmm7, xmm3
psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7, xmm6 ; xmm7=data2
; Exchange through the scratch slots: fetch tmp6/tmp7 for the odd part and
; park data4/data6 until pass 2 picks them up.
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
; -- Odd part
; -- Odd part
; Inputs: xmm2=tmp4, xmm5=tmp5, xmm0=tmp6, xmm6=tmp7.
; Produces data1/data3/data5/data7 of pass 1.  Every operand that feeds a
; pmulhw is first shifted left by PRE_MULTIPLY_SCALE_BITS to match the
; pre-shifted constants.  z5 is computed once from (tmp10 - tmp12) and then
; added to both the tmp10 and the tmp12 products to give z2 and z4.
paddw xmm2,xmm5 ; xmm2=tmp10
paddw xmm5,xmm0 ; xmm5=tmp11
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
movdqa xmm4,xmm2 ; xmm4=tmp10
psubw xmm2,xmm0
pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm2 ; xmm4=z2
paddw xmm0,xmm2 ; xmm0=z4
movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2, xmm0
pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3,xmm6
psubw xmm6,xmm5 ; xmm6=z13
paddw xmm3,xmm5 ; xmm3=z11
movdqa xmm3, xmm6
psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3, xmm5 ; xmm3=z11
; Final odd-part butterflies; the results stay in registers for pass 2.
movdqa xmm2,xmm6
movdqa xmm5,xmm3
psubw xmm6,xmm4 ; xmm6=data3
psubw xmm3,xmm0 ; xmm3=data7
paddw xmm2,xmm4 ; xmm2=data5
paddw xmm5,xmm0 ; xmm5=data1
movdqa xmm2, xmm6
movdqa xmm5, xmm3
psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns.
; ---- Pass 2: process columns.
; Pass 1 left its eight result vectors in xmm1/xmm5/xmm7/xmm6/xmm2/xmm3 plus
; wk(0)/wk(1); nothing was written back to memory.  This section transposes
; them again (word, dword, qword phases) and forms tmp0..tmp7 for the second
; 1-D transform.  edx is not modified by pass 1, which is why the pointer
; reload below stays commented out.
; mov edx, POINTER [data(eax)] ; (DCTELEM *)
; mov edx, POINTER [data(eax)] ; (DCTELEM *)
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
; Fetch the two pass-1 results that were parked in the scratch slots.
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
; First butterfly stage on the outer pairs (0,7) and (1,6).
movdqa xmm5,xmm6
movdqa xmm3,xmm1
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm5, xmm6
movdqa xmm3, xmm1
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
; tmp6/tmp7 wait in the scratch slots until the odd part needs them.
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
; First butterfly stage on the inner pairs (3,4) and (2,5).
movdqa xmm7,xmm6
movdqa xmm0,xmm2
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
movdqa xmm7, xmm6
movdqa xmm0, xmm2
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part
; -- Even part
; Inputs: xmm3=tmp0, xmm5=tmp1, xmm2=tmp2, xmm6=tmp3.
; Same arithmetic as the pass-1 even part, but the four results are final
; and are written straight back to rows 0, 2, 4 and 6 of the block at edx.
movdqa xmm4,xmm3
movdqa xmm1,xmm5
psubw xmm3,xmm6 ; xmm3=tmp13
psubw xmm5,xmm2 ; xmm5=tmp12
paddw xmm4,xmm6 ; xmm4=tmp10
paddw xmm1,xmm2 ; xmm1=tmp11
movdqa xmm4, xmm3
movdqa xmm1, xmm5
psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1, xmm2 ; xmm1=tmp11
; z1 = (tmp12 + tmp13) * 0.707...: pre-scale, then pmulhw keeps the high word.
paddw xmm5,xmm3
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
paddw xmm5, xmm3
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
movdqa xmm6,xmm4
movdqa xmm2,xmm3
psubw xmm4,xmm1 ; xmm4=data4
psubw xmm3,xmm5 ; xmm3=data6
paddw xmm6,xmm1 ; xmm6=data0
paddw xmm2,xmm5 ; xmm2=data2
movdqa xmm6, xmm4
movdqa xmm2, xmm3
psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2, xmm5 ; xmm2=data2
; Store the even output rows in place (aligned stores).
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
; -- Odd part
; -- Odd part
; Inputs: xmm7=tmp4, xmm0=tmp5, plus tmp6/tmp7 reloaded from wk(0)/wk(1).
; Same arithmetic as the pass-1 odd part; the four results are final and are
; written back to rows 1, 3, 5 and 7 of the block at edx.
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7,xmm0 ; xmm7=tmp10
paddw xmm0,xmm1 ; xmm0=tmp11
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
; Pre-scale every operand that is about to be multiplied with pmulhw.
psllw xmm7,PRE_MULTIPLY_SCALE_BITS
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
; z5 is shared: it is added to both scaled products to form z2 and z4.
movdqa xmm4,xmm7 ; xmm4=tmp10
psubw xmm7,xmm1
pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm7 ; xmm4=z2
paddw xmm1,xmm7 ; xmm1=z4
movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7, xmm1
pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3,xmm5
psubw xmm5,xmm0 ; xmm5=z13
paddw xmm3,xmm0 ; xmm3=z11
movdqa xmm3, xmm5
psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6,xmm5
movdqa xmm2,xmm3
psubw xmm5,xmm4 ; xmm5=data3
psubw xmm3,xmm1 ; xmm3=data7
paddw xmm6,xmm4 ; xmm6=data5
paddw xmm2,xmm1 ; xmm2=data1
movdqa xmm6, xmm5
movdqa xmm2, xmm3
psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2, xmm1 ; xmm2=data1
; Store the odd output rows in place (aligned stores).
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
; Epilogue: undo the aligned frame built by the prologue.
;   mov esp,ebp  drops the wk() scratch area (esp = aligned frame base)
;   pop esp      reloads the pointer the prologue stored at [aligned ebp],
;                i.e. the stack position just after "push ebp"
;   pop ebp      restores the caller's ebp, leaving the return address on top
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -25,34 +25,34 @@
; --------------------------------------------------------------------------
; unpcklps2 dst, src: combine the LOW 64-bit halves of two 4-float vectors.
; shufps immediate 0x44 = 01 00 01 00b: result lanes 0,1 come from lanes 0,1
; of %1 and result lanes 2,3 come from lanes 0,1 of %2.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44
%endmacro
; unpckhps2 dst, src: combine the HIGH 64-bit halves of two 4-float vectors.
; shufps immediate 0xEE = 11 10 11 10b: result lanes 0,1 come from lanes 2,3
; of %1 and result lanes 2,3 come from lanes 2,3 of %2.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
; Constant table for jsimd_idct_float_sse2.
; Each single-precision constant is emitted four times ("times 4 dd"), i.e.
; one 128-bit vector per constant, so it can be used directly as a packed
; memory operand.  PB_CENTERJSAMP is sixteen copies of the byte
; CENTERJSAMPLE, again one full 128-bit vector.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_idct_float_sse2)
alignz 16
global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2):
; NOTE(review): PD_RNDINT_MAGIC and PB_CENTERJSAMP are not referenced in the
; visible part of the function; presumably they serve the float-to-integer
; rounding and sample re-centering of the output stage -- confirm there.
PD_1_414 times 4 dd 1.414213562373095048801689
PD_1_847 times 4 dd 1.847759065022573512256366
PD_1_082 times 4 dd 1.082392200292393968799446
PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
PD_1_414 times 4 dd 1.414213562373095048801689
PD_1_847 times 4 dd 1.847759065022573512256366
PD_1_082 times 4 dd 1.082392200292393968799446
PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
@@ -66,417 +66,417 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
align 16
global EXTN(jsimd_idct_float_sse2)
align 16
global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2):
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [workspace]
collect_args
push rbx
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [workspace]
collect_args
push rbx
; ---- Pass 1: process columns from input, store into work array.
; ---- Pass 1: process columns from input, store into work array.
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
lea rdi, [workspace] ; FAST_FLOAT *wsptr
mov rcx, DCTSIZE/4 ; ctr
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
lea rdi, [workspace] ; FAST_FLOAT *wsptr
mov rcx, DCTSIZE/4 ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm2
por xmm3,xmm4
por xmm5,xmm6
por xmm1,xmm3
por xmm5,xmm7
por xmm1,xmm5
packsswb xmm1,xmm1
movd eax,xmm1
test rax,rax
jnz short .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1, xmm2
por xmm3, xmm4
por xmm5, xmm6
por xmm1, xmm3
por xmm5, xmm7
por xmm1, xmm5
packsswb xmm1, xmm1
movd eax, xmm1
test rax, rax
jnz short .columnDCT
; -- AC terms all zero
; -- AC terms all zero
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm1,xmm0
movaps xmm2,xmm0
movaps xmm3,xmm0
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn
%endif
.columnDCT:
; -- Even part
; -- Even part
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm0
movaps xmm5,xmm1
subps xmm0,xmm2 ; xmm0=tmp11
subps xmm1,xmm3
addps xmm4,xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13
movaps xmm4, xmm0
movaps xmm5, xmm1
subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1, xmm3
addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[rel PD_1_414]
subps xmm1,xmm5 ; xmm1=tmp12
mulps xmm1, [rel PD_1_414]
subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4
movaps xmm7,xmm0
subps xmm4,xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1
movaps xmm6, xmm4
movaps xmm7, xmm0
subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part
; -- Odd part
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm2
movaps xmm0,xmm5
addps xmm2,xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10
movaps xmm4, xmm2
movaps xmm0, xmm5
addps xmm2, xmm1 ; xmm2=z11
addps xmm5, xmm3 ; xmm5=z13
subps xmm4, xmm1 ; xmm4=z12
subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2
subps xmm2,xmm5
addps xmm1,xmm5 ; xmm1=tmp7
movaps xmm1, xmm2
subps xmm2, xmm5
addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
movaps xmm3,xmm0
addps xmm0,xmm4
mulps xmm0,[rel PD_1_847] ; xmm0=z5
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10
movaps xmm3, xmm0
addps xmm0, xmm4
mulps xmm0, [rel PD_1_847] ; xmm0=z5
mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage
; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6
movaps xmm5,xmm6
movaps xmm0,xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2,xmm3 ; xmm2=tmp5
subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5, xmm6
movaps xmm0, xmm7
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
movaps xmm3,xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
addps xmm4,xmm2 ; xmm4=tmp4
movaps xmm0,xmm7
movaps xmm3,xmm5
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm0, xmm7
movaps xmm3, xmm5
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
movaps xmm2,xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
movaps xmm4,xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3,xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
movaps xmm0,xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6,xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
movaps xmm3,xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
.nextcolumn:
add rsi, byte 4*SIZEOF_JCOEF ; coef_block
add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
dec rcx ; ctr
jnz near .columnloop
add rsi, byte 4*SIZEOF_JCOEF ; coef_block
add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
dec rcx ; ctr
jnz near .columnloop
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array.
; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT *wsptr
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
mov rcx, DCTSIZE/4 ; ctr
mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT *wsptr
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
mov rcx, DCTSIZE/4 ; ctr
.rowloop:
; -- Even part
; -- Even part
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm0
movaps xmm5,xmm1
subps xmm0,xmm2 ; xmm0=tmp11
subps xmm1,xmm3
addps xmm4,xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13
movaps xmm4, xmm0
movaps xmm5, xmm1
subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1, xmm3
addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[rel PD_1_414]
subps xmm1,xmm5 ; xmm1=tmp12
mulps xmm1, [rel PD_1_414]
subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4
movaps xmm7,xmm0
subps xmm4,xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1
movaps xmm6, xmm4
movaps xmm7, xmm0
subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part
; -- Odd part
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm2
movaps xmm0,xmm5
addps xmm2,xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10
movaps xmm4, xmm2
movaps xmm0, xmm5
addps xmm2, xmm1 ; xmm2=z11
addps xmm5, xmm3 ; xmm5=z13
subps xmm4, xmm1 ; xmm4=z12
subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2
subps xmm2,xmm5
addps xmm1,xmm5 ; xmm1=tmp7
movaps xmm1, xmm2
subps xmm2, xmm5
addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
movaps xmm3,xmm0
addps xmm0,xmm4
mulps xmm0,[rel PD_1_847] ; xmm0=z5
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10
movaps xmm3, xmm0
addps xmm0, xmm4
mulps xmm0, [rel PD_1_847] ; xmm0=z5
mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage
; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6
movaps xmm5,xmm6
movaps xmm0,xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2,xmm3 ; xmm2=tmp5
subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5, xmm6
movaps xmm0, xmm7
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm3,xmm3
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm3, xmm3
psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
addps xmm4,xmm2 ; xmm4=tmp4
movaps xmm7,xmm1
movaps xmm5,xmm3
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm7, xmm1
movaps xmm5, xmm3
addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm4,xmm4
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm4, xmm4
psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6,xmm2
paddb xmm1,xmm2
packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6, xmm2
paddb xmm1, xmm2
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add rdi, byte 4*SIZEOF_JSAMPROW
dec rcx ; ctr
jnz near .rowloop
add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add rdi, byte 4*SIZEOF_JSAMPROW
dec rcx ; ctr
jnz near .rowloop
pop rbx
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
pop rbx
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -24,34 +24,34 @@
; --------------------------------------------------------------------------
; unpcklps2 / unpckhps2: two-operand helpers that interleave register *pairs*
; of packed floats (64-bit granularity) instead of single floats, built on
; shufps. They are used by the "transpose coefficients(phase 2)" steps of the
; IDCT column pass.
;   %1 = destination and first source (four floats, lanes 0..3)
;   %2 = second source (four floats, written as lanes 4..7 in the comments)
; NOTE(review): in this listing each macro header and body appears twice
; (comma-tight and comma-spaced spelling of the same instruction) with only
; one %endmacro -- this looks like a before/after view of a reformat rather
; than assemblable source; confirm before assembling.
;
; unpcklps2: imm8 0x44 picks lanes 0,1 of %1 and lanes 0,1 of %2, so %1 keeps
; its own low pair and receives the low pair of %2 in its high half.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44
%endmacro
; unpckhps2: imm8 0xEE picks lanes 2,3 of %1 and lanes 2,3 of %2, so %1 ends
; up holding its own high pair followed by the high pair of %2.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
; Constant table for the SSE2 floating-point IDCT, exported under the label
; jconst_idct_float_sse2 and padded/aligned to 16 bytes on both sides so every
; entry can be used directly as an aligned xmmword memory operand.
; Each PD_* entry is one single-precision value replicated across all four
; lanes ("times 4 dd"); PB_CENTERJSAMP is one byte replicated 16 times.
;   PD_1_414        sqrt(2); multiplier for the even-part tmp12 and the
;                   odd-part tmp11 terms
;   PD_1_847        multiplier that produces z5 from (z10 + z12)
;   PD_1_082        multiplier applied to z12
;   PD_M2_613       (negative) multiplier applied to z10
;   PD_RNDINT_MAGIC bias added with addps in pass 2 so that, per the code
;                   comments there, roundint(data/8) lands in the low word of
;                   each dword, where it is then masked/shifted and packed
;   PB_CENTERJSAMP  CENTERJSAMPLE in every byte; added with paddb after
;                   packsswb to re-centre the output samples
; NOTE(review): every line in this listing is present twice (old and new
; whitespace of the same statement); only one copy belongs in real source.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_idct_float_sse2)
alignz 16
global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2):
PD_1_414 times 4 dd 1.414213562373095048801689
PD_1_847 times 4 dd 1.847759065022573512256366
PD_1_082 times 4 dd 1.082392200292393968799446
PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
PD_1_414 times 4 dd 1.414213562373095048801689
PD_1_847 times 4 dd 1.847759065022573512256366
PD_1_082 times 4 dd 1.082392200292393968799446
PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
@@ -60,438 +60,438 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Stack-frame layout helpers for the 32-bit jsimd_idct_float_sse2.
;
; Argument accessors take a base register b. The prologue copies esp into eax
; immediately after "push ebp", so with b = eax the slot at b+0 is the saved
; ebp and the four arguments sit at b+8 .. b+20 in 4-byte steps
; (presumably b+4 is the return address -- standard stack-args call; confirm
; against the platform ABI).
;
; Locals hang below the 16-byte-aligned ebp that the prologue establishes:
;   original_ebp  slot at [ebp+0] where the prologue stores the pre-alignment
;                 stack pointer (eax)
;   wk(i)         i-th xmmword of a WK_NUM-entry scratch array that ends
;                 at ebp
;   workspace     DCTSIZE2 FAST_FLOAT values placed directly below wk(0); the
;                 prologue points esp here to reserve all of the locals
; NOTE(review): every %define in this listing is present twice (old and new
; whitespace of the same statement); only one copy belongs in real source.
%define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
%define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
align 16
global EXTN(jsimd_idct_float_sse2)
align 16
global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [workspace]
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [workspace]
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array.
; ---- Pass 1: process columns from input, store into work array.
; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/4 ; ctr
alignx 16,7
; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm2
por xmm3,xmm4
por xmm5,xmm6
por xmm1,xmm3
por xmm5,xmm7
por xmm1,xmm5
packsswb xmm1,xmm1
movd eax,xmm1
test eax,eax
jnz short .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1, xmm2
por xmm3, xmm4
por xmm5, xmm6
por xmm1, xmm3
por xmm5, xmm7
por xmm1, xmm5
packsswb xmm1, xmm1
movd eax, xmm1
test eax, eax
jnz short .columnDCT
; -- AC terms all zero
; -- AC terms all zero
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm1,xmm0
movaps xmm2,xmm0
movaps xmm3,xmm0
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn
alignx 16,7
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn
alignx 16, 7
%endif
.columnDCT:
; -- Even part
; -- Even part
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm0
movaps xmm5,xmm1
subps xmm0,xmm2 ; xmm0=tmp11
subps xmm1,xmm3
addps xmm4,xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13
movaps xmm4, xmm0
movaps xmm5, xmm1
subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1, xmm3
addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[GOTOFF(ebx,PD_1_414)]
subps xmm1,xmm5 ; xmm1=tmp12
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4
movaps xmm7,xmm0
subps xmm4,xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1
movaps xmm6, xmm4
movaps xmm7, xmm0
subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part
; -- Odd part
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm2
movaps xmm0,xmm5
addps xmm2,xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10
movaps xmm4, xmm2
movaps xmm0, xmm5
addps xmm2, xmm1 ; xmm2=z11
addps xmm5, xmm3 ; xmm5=z13
subps xmm4, xmm1 ; xmm4=z12
subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2
subps xmm2,xmm5
addps xmm1,xmm5 ; xmm1=tmp7
movaps xmm1, xmm2
subps xmm2, xmm5
addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
movaps xmm3,xmm0
addps xmm0,xmm4
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10
movaps xmm3, xmm0
addps xmm0, xmm4
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage
; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6
movaps xmm5,xmm6
movaps xmm0,xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2,xmm3 ; xmm2=tmp5
subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5, xmm6
movaps xmm0, xmm7
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
movaps xmm3,xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
addps xmm4,xmm2 ; xmm4=tmp4
movaps xmm0,xmm7
movaps xmm3,xmm5
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm0, xmm7
movaps xmm3, xmm5
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
movaps xmm2,xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
movaps xmm4,xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3,xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
movaps xmm0,xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6,xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
movaps xmm3,xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
.nextcolumn:
add esi, byte 4*SIZEOF_JCOEF ; coef_block
add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
dec ecx ; ctr
jnz near .columnloop
add esi, byte 4*SIZEOF_JCOEF ; coef_block
add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
dec ecx ; ctr
jnz near .columnloop
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array.
; ---- Pass 2: process rows from work array, store into output array.
mov eax, [original_ebp]
lea esi, [workspace] ; FAST_FLOAT *wsptr
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr
alignx 16,7
mov eax, [original_ebp]
lea esi, [workspace] ; FAST_FLOAT *wsptr
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7
.rowloop:
; -- Even part
; -- Even part
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm0
movaps xmm5,xmm1
subps xmm0,xmm2 ; xmm0=tmp11
subps xmm1,xmm3
addps xmm4,xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13
movaps xmm4, xmm0
movaps xmm5, xmm1
subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1, xmm3
addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[GOTOFF(ebx,PD_1_414)]
subps xmm1,xmm5 ; xmm1=tmp12
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4
movaps xmm7,xmm0
subps xmm4,xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1
movaps xmm6, xmm4
movaps xmm7, xmm0
subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part
; -- Odd part
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm2
movaps xmm0,xmm5
addps xmm2,xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10
movaps xmm4, xmm2
movaps xmm0, xmm5
addps xmm2, xmm1 ; xmm2=z11
addps xmm5, xmm3 ; xmm5=z13
subps xmm4, xmm1 ; xmm4=z12
subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2
subps xmm2,xmm5
addps xmm1,xmm5 ; xmm1=tmp7
movaps xmm1, xmm2
subps xmm2, xmm5
addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
movaps xmm3,xmm0
addps xmm0,xmm4
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10
movaps xmm3, xmm0
addps xmm0, xmm4
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage
; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6
movaps xmm5,xmm6
movaps xmm0,xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2,xmm3 ; xmm2=tmp5
subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5, xmm6
movaps xmm0, xmm7
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
pcmpeqd xmm3,xmm3
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
pcmpeqd xmm3, xmm3
psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
addps xmm4,xmm2 ; xmm4=tmp4
movaps xmm7,xmm1
movaps xmm5,xmm3
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm7, xmm1
movaps xmm5, xmm3
addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
pcmpeqd xmm4,xmm4
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
pcmpeqd xmm4, xmm4
psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6,xmm2
paddb xmm1,xmm2
packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6, xmm2
paddb xmm1, xmm2
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pushpic ebx ; save GOT address
pushpic ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
poppic ebx ; restore GOT address
poppic ebx ; restore GOT address
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW
dec ecx ; ctr
jnz near .rowloop
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW
dec ecx ; ctr
jnz near .rowloop
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -26,54 +26,54 @@
; --------------------------------------------------------------------------
; Fixed-point configuration for the multiplier constants below.
; CONST_BITS is the number of fractional bits in FIX(x): FIX(1) is
; (1 << CONST_BITS), as used in the definition of F_1_613.
%define CONST_BITS 8 ; 14 is also OK.
%define PASS1_BITS 2
%define CONST_BITS 8 ; 14 is also OK.
%define PASS1_BITS 2
; IFAST_SCALE_BITS is defined outside this file section; assembly is aborted
; if it disagrees with PASS1_BITS.
%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif
; With CONST_BITS == 8 the constants are written out directly as
; round(x * 256).
%if CONST_BITS == 8
F_1_082 equ 277 ; FIX(1.082392200)
F_1_414 equ 362 ; FIX(1.414213562)
F_1_847 equ 473 ; FIX(1.847759065)
F_2_613 equ 669 ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
F_1_082 equ 277 ; FIX(1.082392200)
F_1_414 equ 362 ; FIX(1.414213562)
F_1_847 equ 473 ; FIX(1.847759065)
F_2_613 equ 669 ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead, each factor is given as an integer pre-scaled by 2^30 and reduced
; to CONST_BITS fractional bits. DESCALE(x,n) is a right shift by n that adds
; half of the divisor first, i.e. it rounds to nearest instead of truncating.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif
; --------------------------------------------------------------------------
; Constant tables for the fast integer IDCT. Each PW_* entry repeats one
; 16-bit multiplier 8 times, i.e. it fills one 128-bit XMM operand.
SECTION SEG_CONST
SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; The code multiplies with pmulhw after shifting the data operand left by
; PRE_MULTIPLY_SCALE_BITS (psllw). The multipliers below are stored
; pre-shifted left by CONST_SHIFT so that the three shifts add up to 16.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16
global EXTN(jconst_idct_ifast_sse2)
alignz 16
global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2):
; PW_MF1613 holds the negated F_1_613 factor; the other PW_* entries are
; positive factors.
; PB_CENTERJSAMP is 16 bytes of CENTERJSAMPLE (defined elsewhere); its use is
; outside this excerpt -- presumably the sample re-centering offset, confirm
; against pass 2 of the routine.
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
@@ -87,405 +87,405 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_idct_ifast_sse2)
align 16
global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2):
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
; ---- Pass 1: process columns from input.
; ---- Pass 1: process columns from input.
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm0
packsswb xmm1,xmm1
packsswb xmm1,xmm1
movd eax,xmm1
test rax,rax
jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1, xmm0
packsswb xmm1, xmm1
packsswb xmm1, xmm1
movd eax, xmm1
test rax, rax
jnz short .columnDCT
; -- AC terms all zero
; -- AC terms all zero
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end
%endif
.columnDCT:
; -- Even part
; -- Even part
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm0
movdqa xmm5,xmm1
psubw xmm0,xmm2 ; xmm0=tmp11
psubw xmm1,xmm3
paddw xmm4,xmm2 ; xmm4=tmp10
paddw xmm5,xmm3 ; xmm5=tmp13
movdqa xmm4, xmm0
movdqa xmm5, xmm1
psubw xmm0, xmm2 ; xmm0=tmp11
psubw xmm1, xmm3
paddw xmm4, xmm2 ; xmm4=tmp10
paddw xmm5, xmm3 ; xmm5=tmp13
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1,[rel PW_F1414]
psubw xmm1,xmm5 ; xmm1=tmp12
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1, [rel PW_F1414]
psubw xmm1, xmm5 ; xmm1=tmp12
movdqa xmm6,xmm4
movdqa xmm7,xmm0
psubw xmm4,xmm5 ; xmm4=tmp3
psubw xmm0,xmm1 ; xmm0=tmp2
paddw xmm6,xmm5 ; xmm6=tmp0
paddw xmm7,xmm1 ; xmm7=tmp1
movdqa xmm6, xmm4
movdqa xmm7, xmm0
psubw xmm4, xmm5 ; xmm4=tmp3
psubw xmm0, xmm1 ; xmm0=tmp2
paddw xmm6, xmm5 ; xmm6=tmp0
paddw xmm7, xmm1 ; xmm7=tmp1
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
; -- Odd part
; -- Odd part
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm2
movdqa xmm0,xmm5
psubw xmm2,xmm1 ; xmm2=z12
psubw xmm5,xmm3 ; xmm5=z10
paddw xmm4,xmm1 ; xmm4=z11
paddw xmm0,xmm3 ; xmm0=z13
movdqa xmm4, xmm2
movdqa xmm0, xmm5
psubw xmm2, xmm1 ; xmm2=z12
psubw xmm5, xmm3 ; xmm5=z10
paddw xmm4, xmm1 ; xmm4=z11
paddw xmm0, xmm3 ; xmm0=z13
movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
movdqa xmm3,xmm4
psubw xmm4,xmm0
paddw xmm3,xmm0 ; xmm3=tmp7
movdqa xmm3, xmm4
psubw xmm4, xmm0
paddw xmm3, xmm0 ; xmm3=tmp7
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
movdqa xmm0,xmm5
paddw xmm5,xmm2
pmulhw xmm5,[rel PW_F1847] ; xmm5=z5
pmulhw xmm0,[rel PW_MF1613]
pmulhw xmm2,[rel PW_F1082]
psubw xmm0,xmm1
psubw xmm2,xmm5 ; xmm2=tmp10
paddw xmm0,xmm5 ; xmm0=tmp12
movdqa xmm0, xmm5
paddw xmm5, xmm2
pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
pmulhw xmm0, [rel PW_MF1613]
pmulhw xmm2, [rel PW_F1082]
psubw xmm0, xmm1
psubw xmm2, xmm5 ; xmm2=tmp10
paddw xmm0, xmm5 ; xmm0=tmp12
; -- Final output stage
; -- Final output stage
psubw xmm0,xmm3 ; xmm0=tmp6
movdqa xmm1,xmm6
movdqa xmm5,xmm7
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4,xmm0 ; xmm4=tmp5
psubw xmm0, xmm3 ; xmm0=tmp6
movdqa xmm1, xmm6
movdqa xmm5, xmm7
paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4, xmm0 ; xmm4=tmp5
movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
paddw xmm2,xmm4 ; xmm2=tmp4
movdqa xmm5,xmm7
movdqa xmm0,xmm1
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
paddw xmm2, xmm4 ; xmm2=tmp4
movdqa xmm5, xmm7
movdqa xmm0, xmm1
paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array.
; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
; -- Even part
; -- Even part
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
movdqa xmm2,xmm6
movdqa xmm0,xmm5
psubw xmm6,xmm1 ; xmm6=tmp11
psubw xmm5,xmm3
paddw xmm2,xmm1 ; xmm2=tmp10
paddw xmm0,xmm3 ; xmm0=tmp13
movdqa xmm2, xmm6
movdqa xmm0, xmm5
psubw xmm6, xmm1 ; xmm6=tmp11
psubw xmm5, xmm3
paddw xmm2, xmm1 ; xmm2=tmp10
paddw xmm0, xmm3 ; xmm0=tmp13
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F1414]
psubw xmm5,xmm0 ; xmm5=tmp12
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [rel PW_F1414]
psubw xmm5, xmm0 ; xmm5=tmp12
movdqa xmm1,xmm2
movdqa xmm3,xmm6
psubw xmm2,xmm0 ; xmm2=tmp3
psubw xmm6,xmm5 ; xmm6=tmp2
paddw xmm1,xmm0 ; xmm1=tmp0
paddw xmm3,xmm5 ; xmm3=tmp1
movdqa xmm1, xmm2
movdqa xmm3, xmm6
psubw xmm2, xmm0 ; xmm2=tmp3
psubw xmm6, xmm5 ; xmm6=tmp2
paddw xmm1, xmm0 ; xmm1=tmp0
paddw xmm3, xmm5 ; xmm3=tmp1
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
; -- Odd part
; -- Odd part
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
movdqa xmm2,xmm0
movdqa xmm6,xmm4
psubw xmm0,xmm7 ; xmm0=z12
psubw xmm4,xmm5 ; xmm4=z10
paddw xmm2,xmm7 ; xmm2=z11
paddw xmm6,xmm5 ; xmm6=z13
movdqa xmm2, xmm0
movdqa xmm6, xmm4
psubw xmm0, xmm7 ; xmm0=z12
psubw xmm4, xmm5 ; xmm4=z10
paddw xmm2, xmm7 ; xmm2=z11
paddw xmm6, xmm5 ; xmm6=z13
movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
movdqa xmm5,xmm2
psubw xmm2,xmm6
paddw xmm5,xmm6 ; xmm5=tmp7
movdqa xmm5, xmm2
psubw xmm2, xmm6
paddw xmm5, xmm6 ; xmm5=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
movdqa xmm6,xmm4
paddw xmm4,xmm0
pmulhw xmm4,[rel PW_F1847] ; xmm4=z5
pmulhw xmm6,[rel PW_MF1613]
pmulhw xmm0,[rel PW_F1082]
psubw xmm6,xmm7
psubw xmm0,xmm4 ; xmm0=tmp10
paddw xmm6,xmm4 ; xmm6=tmp12
movdqa xmm6, xmm4
paddw xmm4, xmm0
pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
pmulhw xmm6, [rel PW_MF1613]
pmulhw xmm0, [rel PW_F1082]
psubw xmm6, xmm7
psubw xmm0, xmm4 ; xmm0=tmp10
paddw xmm6, xmm4 ; xmm6=tmp12
; -- Final output stage
; -- Final output stage
psubw xmm6,xmm5 ; xmm6=tmp6
movdqa xmm7,xmm1
movdqa xmm4,xmm3
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1,(PASS1_BITS+3) ; descale
psraw xmm3,(PASS1_BITS+3) ; descale
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7,(PASS1_BITS+3) ; descale
psraw xmm4,(PASS1_BITS+3) ; descale
psubw xmm2,xmm6 ; xmm2=tmp5
psubw xmm6, xmm5 ; xmm6=tmp6
movdqa xmm7, xmm1
movdqa xmm4, xmm3
paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1, (PASS1_BITS+3) ; descale
psraw xmm3, (PASS1_BITS+3) ; descale
psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7, (PASS1_BITS+3) ; descale
psraw xmm4, (PASS1_BITS+3) ; descale
psubw xmm2, xmm6 ; xmm2=tmp5
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
paddw xmm0,xmm2 ; xmm0=tmp4
movdqa xmm4,xmm5
movdqa xmm7,xmm6
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5,(PASS1_BITS+3) ; descale
psraw xmm6,(PASS1_BITS+3) ; descale
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4,(PASS1_BITS+3) ; descale
psraw xmm7,(PASS1_BITS+3) ; descale
paddw xmm0, xmm2 ; xmm0=tmp4
movdqa xmm4, xmm5
movdqa xmm7, xmm6
paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5, (PASS1_BITS+3) ; descale
psraw xmm6, (PASS1_BITS+3) ; descale
psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4, (PASS1_BITS+3) ; descale
psraw xmm7, (PASS1_BITS+3) ; descale
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm1,xmm2
paddb xmm3,xmm2
paddb xmm5,xmm2
paddb xmm7,xmm2
paddb xmm1, xmm2
paddb xmm3, xmm2
paddb xmm5, xmm2
paddb xmm7, xmm2
movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
ret
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -25,54 +25,54 @@
; --------------------------------------------------------------------------
; Fixed-point parameters for the fast integer IDCT.
;   CONST_BITS  = number of fraction bits carried by the F_* multipliers below.
;   PASS1_BITS  = extra scaling removed in the final descale, which shifts
;                 right by (PASS1_BITS+3).  It must equal IFAST_SCALE_BITS
;                 (defined outside this excerpt); the %error below enforces
;                 that at assembly time.
%define CONST_BITS 8 ; 14 is also OK.
%define PASS1_BITS 2
%define CONST_BITS 8 ; 14 is also OK.
%define PASS1_BITS 2
%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif
; F_x_yyy = FIX(x.yyy) = round(x.yyy * 2^CONST_BITS).
%if CONST_BITS == 8
; Hand-computed values for CONST_BITS == 8 (e.g. 1.414213562 * 256 ~= 362).
F_1_082 equ 277 ; FIX(1.082392200)
F_1_414 equ 362 ; FIX(1.414213562)
F_1_847 equ 473 ; FIX(1.847759065)
F_2_613 equ 669 ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
F_1_082 equ 277 ; FIX(1.082392200)
F_1_414 equ 362 ; FIX(1.414213562)
F_1_847 equ 473 ; FIX(1.847759065)
F_2_613 equ 669 ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else
; General case: each literal below is the constant pre-scaled by 2^30;
; DESCALE(x,n) is a rounding right shift by n, which leaves CONST_BITS
; fraction bits.
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif
; --------------------------------------------------------------------------
; Read-only constant table used by the IDCT routine below.
SECTION SEG_CONST
SECTION SEG_CONST
; Scaling invariant: the code shifts an operand left by
; PRE_MULTIPLY_SCALE_BITS (psllw) and then multiplies with pmulhw, which keeps
; only the high 16 bits of each signed 16x16 product (an implicit >> 16).
; Storing every multiplier as F << CONST_SHIFT makes the three scale factors
; add up to 16, so the result comes back at the operand's original scale.
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16
global EXTN(jconst_idct_ifast_sse2)
alignz 16
global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2):
; Each PW_* entry is one XMM word: the same 16-bit multiplier repeated 8 times.
; PW_MF1613 holds -(2.613 - 1): with CONST_BITS == 8, F_2_613 << CONST_SHIFT
; would be 669 << 6 = 42816, which does not fit a signed 16-bit lane, so the
; code multiplies by -1.613 and subtracts the unscaled operand separately
; (see the "To avoid overflow..." note in the routine).
; PB_CENTERJSAMP is 16 bytes of CENTERJSAMPLE (defined outside this excerpt);
; presumably the level shift added back to the packed output samples, as the
; x86-64 variant of this routine does with paddb -- confirm for this file.
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; Pad the table out to a 16-byte boundary.
alignz 16
alignz 16
; --------------------------------------------------------------------------
; Everything from here on goes into the text segment and is assembled as
; 32-bit code.
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
@@ -81,421 +81,421 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Stack-argument accessors.  (b) is the value the prologue stores at
; [original_ebp]: esp as it was right after `push ebp`.  (b)+0 is therefore
; the saved ebp, (b)+4 presumably the return address, and the arguments start
; at (b)+8 in declaration order.
%define dct_table(b) (b)+8 ; POINTER dct_table (read as quantptr)
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
%define dct_table(b) (b)+8 ; POINTER dct_table (read as quantptr)
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
; Frame layout after the prologue: ebp is aligned to SIZEOF_XMMWORD, [ebp+0]
; holds the pre-alignment stack pointer, and WK_NUM XMM-sized scratch slots sit
; directly below ebp with wk(0) at the lowest address.  wk() may reference
; WK_NUM before its %define because %define bodies are expanded at the point
; of use.
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_idct_ifast_sse2)
align 16
global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input.
; ---- Pass 1: process columns from input.
; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm0
packsswb xmm1,xmm1
packsswb xmm1,xmm1
movd eax,xmm1
test eax,eax
jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1, xmm0
packsswb xmm1, xmm1
packsswb xmm1, xmm1
movd eax, xmm1
test eax, eax
jnz short .columnDCT
; -- AC terms all zero
; -- AC terms all zero
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end
alignx 16,7
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end
alignx 16, 7
%endif
.columnDCT:
; -- Even part
; -- Even part
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm0
movdqa xmm5,xmm1
psubw xmm0,xmm2 ; xmm0=tmp11
psubw xmm1,xmm3
paddw xmm4,xmm2 ; xmm4=tmp10
paddw xmm5,xmm3 ; xmm5=tmp13
movdqa xmm4, xmm0
movdqa xmm5, xmm1
psubw xmm0, xmm2 ; xmm0=tmp11
psubw xmm1, xmm3
paddw xmm4, xmm2 ; xmm4=tmp10
paddw xmm5, xmm3 ; xmm5=tmp13
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1,[GOTOFF(ebx,PW_F1414)]
psubw xmm1,xmm5 ; xmm1=tmp12
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
psubw xmm1, xmm5 ; xmm1=tmp12
movdqa xmm6,xmm4
movdqa xmm7,xmm0
psubw xmm4,xmm5 ; xmm4=tmp3
psubw xmm0,xmm1 ; xmm0=tmp2
paddw xmm6,xmm5 ; xmm6=tmp0
paddw xmm7,xmm1 ; xmm7=tmp1
movdqa xmm6, xmm4
movdqa xmm7, xmm0
psubw xmm4, xmm5 ; xmm4=tmp3
psubw xmm0, xmm1 ; xmm0=tmp2
paddw xmm6, xmm5 ; xmm6=tmp0
paddw xmm7, xmm1 ; xmm7=tmp1
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
; -- Odd part
; -- Odd part
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm2
movdqa xmm0,xmm5
psubw xmm2,xmm1 ; xmm2=z12
psubw xmm5,xmm3 ; xmm5=z10
paddw xmm4,xmm1 ; xmm4=z11
paddw xmm0,xmm3 ; xmm0=z13
movdqa xmm4, xmm2
movdqa xmm0, xmm5
psubw xmm2, xmm1 ; xmm2=z12
psubw xmm5, xmm3 ; xmm5=z10
paddw xmm4, xmm1 ; xmm4=z11
paddw xmm0, xmm3 ; xmm0=z13
movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
movdqa xmm3,xmm4
psubw xmm4,xmm0
paddw xmm3,xmm0 ; xmm3=tmp7
movdqa xmm3, xmm4
psubw xmm4, xmm0
paddw xmm3, xmm0 ; xmm3=tmp7
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
movdqa xmm0,xmm5
paddw xmm5,xmm2
pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5
pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)]
pmulhw xmm2,[GOTOFF(ebx,PW_F1082)]
psubw xmm0,xmm1
psubw xmm2,xmm5 ; xmm2=tmp10
paddw xmm0,xmm5 ; xmm0=tmp12
movdqa xmm0, xmm5
paddw xmm5, xmm2
pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
psubw xmm0, xmm1
psubw xmm2, xmm5 ; xmm2=tmp10
paddw xmm0, xmm5 ; xmm0=tmp12
; -- Final output stage
; -- Final output stage
psubw xmm0,xmm3 ; xmm0=tmp6
movdqa xmm1,xmm6
movdqa xmm5,xmm7
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4,xmm0 ; xmm4=tmp5
psubw xmm0, xmm3 ; xmm0=tmp6
movdqa xmm1, xmm6
movdqa xmm5, xmm7
paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4, xmm0 ; xmm4=tmp5
movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
paddw xmm2,xmm4 ; xmm2=tmp4
movdqa xmm5,xmm7
movdqa xmm0,xmm1
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
paddw xmm2, xmm4 ; xmm2=tmp4
movdqa xmm5, xmm7
movdqa xmm0, xmm1
paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array.
; ---- Pass 2: process rows from work array, store into output array.
mov eax, [original_ebp]
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)]
mov eax, [original_ebp]
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)]
; -- Even part
; -- Even part
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
movdqa xmm2,xmm6
movdqa xmm0,xmm5
psubw xmm6,xmm1 ; xmm6=tmp11
psubw xmm5,xmm3
paddw xmm2,xmm1 ; xmm2=tmp10
paddw xmm0,xmm3 ; xmm0=tmp13
movdqa xmm2, xmm6
movdqa xmm0, xmm5
psubw xmm6, xmm1 ; xmm6=tmp11
psubw xmm5, xmm3
paddw xmm2, xmm1 ; xmm2=tmp10
paddw xmm0, xmm3 ; xmm0=tmp13
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F1414)]
psubw xmm5,xmm0 ; xmm5=tmp12
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
psubw xmm5, xmm0 ; xmm5=tmp12
movdqa xmm1,xmm2
movdqa xmm3,xmm6
psubw xmm2,xmm0 ; xmm2=tmp3
psubw xmm6,xmm5 ; xmm6=tmp2
paddw xmm1,xmm0 ; xmm1=tmp0
paddw xmm3,xmm5 ; xmm3=tmp1
movdqa xmm1, xmm2
movdqa xmm3, xmm6
psubw xmm2, xmm0 ; xmm2=tmp3
psubw xmm6, xmm5 ; xmm6=tmp2
paddw xmm1, xmm0 ; xmm1=tmp0
paddw xmm3, xmm5 ; xmm3=tmp1
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
; -- Odd part
; -- Odd part
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
movdqa xmm2,xmm0
movdqa xmm6,xmm4
psubw xmm0,xmm7 ; xmm0=z12
psubw xmm4,xmm5 ; xmm4=z10
paddw xmm2,xmm7 ; xmm2=z11
paddw xmm6,xmm5 ; xmm6=z13
movdqa xmm2, xmm0
movdqa xmm6, xmm4
psubw xmm0, xmm7 ; xmm0=z12
psubw xmm4, xmm5 ; xmm4=z10
paddw xmm2, xmm7 ; xmm2=z11
paddw xmm6, xmm5 ; xmm6=z13
movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
movdqa xmm5,xmm2
psubw xmm2,xmm6
paddw xmm5,xmm6 ; xmm5=tmp7
movdqa xmm5, xmm2
psubw xmm2, xmm6
paddw xmm5, xmm6 ; xmm5=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
; To avoid overflow...
;
; (Original)
; tmp12 = -2.613125930 * z10 + z5;
;
; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5;
movdqa xmm6,xmm4
paddw xmm4,xmm0
pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5
pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)]
pmulhw xmm0,[GOTOFF(ebx,PW_F1082)]
psubw xmm6,xmm7
psubw xmm0,xmm4 ; xmm0=tmp10
paddw xmm6,xmm4 ; xmm6=tmp12
movdqa xmm6, xmm4
paddw xmm4, xmm0
pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
psubw xmm6, xmm7
psubw xmm0, xmm4 ; xmm0=tmp10
paddw xmm6, xmm4 ; xmm6=tmp12
; -- Final output stage
; -- Final output stage
psubw xmm6,xmm5 ; xmm6=tmp6
movdqa xmm7,xmm1
movdqa xmm4,xmm3
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1,(PASS1_BITS+3) ; descale
psraw xmm3,(PASS1_BITS+3) ; descale
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7,(PASS1_BITS+3) ; descale
psraw xmm4,(PASS1_BITS+3) ; descale
psubw xmm2,xmm6 ; xmm2=tmp5
psubw xmm6, xmm5 ; xmm6=tmp6
movdqa xmm7, xmm1
movdqa xmm4, xmm3
paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1, (PASS1_BITS+3) ; descale
psraw xmm3, (PASS1_BITS+3) ; descale
psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7, (PASS1_BITS+3) ; descale
psraw xmm4, (PASS1_BITS+3) ; descale
psubw xmm2, xmm6 ; xmm2=tmp5
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
paddw xmm0,xmm2 ; xmm0=tmp4
movdqa xmm4,xmm5
movdqa xmm7,xmm6
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5,(PASS1_BITS+3) ; descale
psraw xmm6,(PASS1_BITS+3) ; descale
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4,(PASS1_BITS+3) ; descale
psraw xmm7,(PASS1_BITS+3) ; descale
paddw xmm0, xmm2 ; xmm0=tmp4
movdqa xmm4, xmm5
movdqa xmm7, xmm6
paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5, (PASS1_BITS+3) ; descale
psraw xmm6, (PASS1_BITS+3) ; descale
psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4, (PASS1_BITS+3) ; descale
psraw xmm7, (PASS1_BITS+3) ; descale
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm1,xmm2
paddb xmm3,xmm2
paddb xmm5,xmm2
paddb xmm7,xmm2
paddb xmm1, xmm2
paddb xmm3, xmm2
paddb xmm5, xmm2
paddb xmm7, xmm2
movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -26,74 +26,74 @@
; --------------------------------------------------------------------------
; Fixed-point parameters for the reduced-size IDCT.
;   CONST_BITS   - number of fraction bits carried by the F_* multipliers
;                  below (each F_x_yyy is the real constant scaled by
;                  2^CONST_BITS and rounded to an integer).
;   PASS1_BITS   - extra scaling bits kept in the pass-1 (column) results.
%define CONST_BITS 13
%define PASS1_BITS 2
%define CONST_BITS 13
%define PASS1_BITS 2
; Right-shift counts used when descaling 32-bit intermediates.
; With CONST_BITS=13 and PASS1_BITS=2 they evaluate to:
;   DESCALE_P1_4 = 12, DESCALE_P2_4 = 19, DESCALE_P1_2 = 13, DESCALE_P2_2 = 20
; The *_4 values are the psrad counts of pass 1 / pass 2 in the 4x4 routine;
; the *_2 values are presumably the 2x2 routine's equivalents -- TODO confirm
; against the 2x2 body.
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
; Fast path: for the default CONST_BITS the multipliers are written out as
; precomputed integers (e.g. 0.211164243 * 2^13 rounds to 1730).
%if CONST_BITS == 13
F_0_211 equ 1730 ; FIX(0.211164243)
F_0_509 equ 4176 ; FIX(0.509795579)
F_0_601 equ 4926 ; FIX(0.601344887)
F_0_720 equ 5906 ; FIX(0.720959822)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_850 equ 6967 ; FIX(0.850430095)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_061 equ 8697 ; FIX(1.061594337)
F_1_272 equ 10426 ; FIX(1.272758580)
F_1_451 equ 11893 ; FIX(1.451774981)
F_1_847 equ 15137 ; FIX(1.847759065)
F_2_172 equ 17799 ; FIX(2.172734803)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_624 equ 29692 ; FIX(3.624509785)
F_0_211 equ 1730 ; FIX(0.211164243)
F_0_509 equ 4176 ; FIX(0.509795579)
F_0_601 equ 4926 ; FIX(0.601344887)
F_0_720 equ 5906 ; FIX(0.720959822)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_850 equ 6967 ; FIX(0.850430095)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_061 equ 8697 ; FIX(1.061594337)
F_1_272 equ 10426 ; FIX(1.272758580)
F_1_451 equ 11893 ; FIX(1.451774981)
F_1_847 equ 15137 ; FIX(1.847759065)
F_2_172 equ 17799 ; FIX(2.172734803)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_624 equ 29692 ; FIX(3.624509785)
%else
; Generic path: each literal below is the real constant pre-scaled by 2^30;
; DESCALE(x,n) is a round-to-nearest right shift by n (adds 1<<(n-1) first),
; which brings the value down to CONST_BITS fraction bits.
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif
; --------------------------------------------------------------------------
; Constant table for the reduced-size IDCT routines (exported as
; jconst_idct_red_sse2). Every entry is one 16-byte XMM operand.
;
;   PW_<a>_<b>       four repetitions of the word pair (a, b). The 4x4
;                    routine applies these with pmaddwd to word-interleaved
;                    inputs (x, y), producing a*x + b*y in each 32-bit lane.
;                    An "M" prefix in the name marks a negated multiplier.
;   PD_DESCALE_*     four dwords holding 1 << (n-1): the rounding bias that
;                    is added (paddd) before the arithmetic right shift by n.
;   PB_CENTERJSAMP   sixteen bytes of CENTERJSAMPLE (defined outside this
;                    chunk); added with paddb to the packed signed bytes
;                    just before they are stored.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_idct_red_sse2)
alignz 16
global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2):
PW_F184_MF076 times 4 dw F_1_847,-F_0_765
PW_F256_F089 times 4 dw F_2_562, F_0_899
PW_F106_MF217 times 4 dw F_1_061,-F_2_172
PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
PW_F145_MF021 times 4 dw F_1_451,-F_0_211
PW_F362_MF127 times 4 dw F_3_624,-F_1_272
PW_F085_MF072 times 4 dw F_0_850,-F_0_720
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
PW_F184_MF076 times 4 dw F_1_847,-F_0_765
PW_F256_F089 times 4 dw F_2_562, F_0_899
PW_F106_MF217 times 4 dw F_1_061,-F_2_172
PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
PW_F145_MF021 times 4 dw F_1_451,-F_0_211
PW_F362_MF127 times 4 dw F_3_624,-F_1_272
PW_F085_MF072 times 4 dw F_0_850,-F_0_720
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; Pad the table out to a 16-byte boundary.
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
@@ -108,292 +108,292 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
; --------------------------------------------------------------------------
; jsimd_idct_4x4_sse2
;
; Dequantizes one block of DCT coefficients and runs a reduced-size inverse
; DCT on it, storing a 4x4 block of samples.
;
; Registers as used by the body:
;   r10 = quantization table (copied to rdx, "quantptr")
;   r11 = coefficient block  (copied to rsi, "inptr")
;   r12 = array of output row pointers (copied to rdi)
;   r13 = output column offset, 32-bit (copied to eax)
; collect_args / uncollect_args are macros defined outside this chunk;
; presumably they bring the ABI arguments into r10-r13 and undo that on
; exit -- TODO confirm against their definition.
;
; Structure: pass 1 works on columns (results kept as 16-bit words with
; PASS1_BITS extra scaling), pass 2 works on rows and stores the samples.
; Coefficient row 4 and, in pass 2, column 4 are never read by this routine.
;
; NOTE(review): in this listing most instructions appear twice in a row,
; once with compact and once with spaced operand spelling. This looks like
; the old and new sides of a whitespace-only reformat shown together --
; confirm against the real source file before assembling.
; --------------------------------------------------------------------------
; Frame layout: [rbp+0] holds the pre-alignment stack pointer; wk(0) and
; wk(1) are two XMMWORD scratch slots immediately below rbp.
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_idct_4x4_sse2)
align 16
global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2):
; Prologue: build an XMMWORD-aligned frame.
;  - rax captures rsp right after the push, i.e. the address of the saved
;    rbp slot; it is parked at [rbp] so the epilogue's `pop rsp` can
;    restore it.
;  - rsp is rounded down to a SIZEOF_XMMWORD boundary and then lowered to
;    wk(0) so the scratch slots sit above the live stack pointer.
; NOTE(review): only 4 bytes are reserved before the alignment, yet an
; 8-byte value is stored at [rsp]. This appears to rely on the incoming rsp
; being at least 8-byte aligned -- confirm.
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
; ---- Pass 1: process columns from input.
; ---- Pass 1: process columns from input.
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
; Optional shortcut (compiled out when NO_ZERO_COLUMN_TEST_4X4_SSE2 is
; defined): if every AC row that this routine reads is zero, pass 1
; degenerates to replicating the dequantized DC row.
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
; Cheap early-out: a single dword from each of rows 1 and 2 is checked
; first; any nonzero bit sends us straight to the full column transform.
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz short .columnDCT
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz short .columnDCT
; Full test: OR together rows 1,2,3,5,6,7 (row 4 is skipped; .columnDCT
; never loads it either). The two saturating packs fold all eight OR-ed
; words into the low dword, so one movd/test tells whether any was nonzero.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm0,xmm1
packsswb xmm0,xmm0
packsswb xmm0,xmm0
movd eax,xmm0
test rax,rax
jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm0, xmm1
packsswb xmm0, xmm0
packsswb xmm0, xmm0
movd eax, xmm0
test rax, rax
jnz short .columnDCT
; -- AC terms all zero
; -- AC terms all zero
; Dequantize row 0 only (coefficient * quantizer, low 16 bits).
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; Scale by 2^PASS1_BITS. This matches the net scaling of the full path,
; which shifts the DC term left by CONST_BITS+1 and later descales by
; DESCALE_P1_4 = CONST_BITS-PASS1_BITS+1.
psllw xmm0,PASS1_BITS
psllw xmm0, PASS1_BITS
; Replicate each column's DC value four times so the registers end up in
; the same [colN colN+1] layout that .columnDCT leaves at .column_end.
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
jmp near .column_end
jmp near .column_end
%endif
.columnDCT:
; -- Odd part
; -- Odd part
; Load and dequantize rows 1, 3, 5 and 7.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; Interleave rows 1 and 3 word-wise (low half -> "L", high half -> "H") so
; that pmaddwd with a (c0,c1) pair yields c0*row1 + c1*row3 per 32-bit lane.
; Two different constant pairs give the row-1/row-3 contribution to tmp2
; and to tmp0.
movdqa xmm4,xmm0
movdqa xmm5,xmm0
punpcklwd xmm4,xmm1
punpckhwd xmm5,xmm1
movdqa xmm0,xmm4
movdqa xmm1,xmm5
pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L)
pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H)
pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L)
pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H)
movdqa xmm4, xmm0
movdqa xmm5, xmm0
punpcklwd xmm4, xmm1
punpckhwd xmm5, xmm1
movdqa xmm0, xmm4
movdqa xmm1, xmm5
pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
; Same scheme for rows 5 and 7.
movdqa xmm6,xmm2
movdqa xmm7,xmm2
punpcklwd xmm6,xmm3
punpckhwd xmm7,xmm3
movdqa xmm2,xmm6
movdqa xmm3,xmm7
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L)
pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H)
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L)
pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H)
movdqa xmm6, xmm2
movdqa xmm7, xmm2
punpcklwd xmm6, xmm3
punpckhwd xmm7, xmm3
movdqa xmm2, xmm6
movdqa xmm3, xmm7
pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
; Sum the two partial products to finish the odd-part terms.
paddd xmm6,xmm4 ; xmm6=tmp2L
paddd xmm7,xmm5 ; xmm7=tmp2H
paddd xmm2,xmm0 ; xmm2=tmp0L
paddd xmm3,xmm1 ; xmm3=tmp0H
paddd xmm6, xmm4 ; xmm6=tmp2L
paddd xmm7, xmm5 ; xmm7=tmp2H
paddd xmm2, xmm0 ; xmm2=tmp0L
paddd xmm3, xmm1 ; xmm3=tmp0H
; Spill odd tmp0 to the scratch slots; xmm2/xmm3 are reused by the even part.
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
; -- Even part
; -- Even part
; Load and dequantize rows 0, 2 and 6.
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; Widen in0 to 32 bits scaled by 2^(CONST_BITS+1): unpacking against a
; zeroed register places each word in the upper half of a dword (value<<16),
; and the arithmetic shift right by 16-CONST_BITS-1 leaves the sign-extended
; value shifted left by CONST_BITS+1.
pxor xmm1,xmm1
pxor xmm2,xmm2
punpcklwd xmm1,xmm4 ; xmm1=tmp0L
punpckhwd xmm2,xmm4 ; xmm2=tmp0H
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
pxor xmm1, xmm1
pxor xmm2, xmm2
punpcklwd xmm1, xmm4 ; xmm1=tmp0L
punpckhwd xmm2, xmm4 ; xmm2=tmp0H
psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
; Even tmp2 = F_1_847*in2 - F_0_765*in6 via interleave + pmaddwd.
movdqa xmm3,xmm5 ; xmm5=in2=z2
punpcklwd xmm5,xmm0 ; xmm0=in6=z3
punpckhwd xmm3,xmm0
pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L
pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H
movdqa xmm3, xmm5 ; xmm5=in2=z2
punpcklwd xmm5, xmm0 ; xmm0=in6=z3
punpckhwd xmm3, xmm0
pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
; tmp10 = tmp0 + tmp2, tmp12 = tmp0 - tmp2 (low and high halves).
movdqa xmm4,xmm1
movdqa xmm0,xmm2
paddd xmm1,xmm5 ; xmm1=tmp10L
paddd xmm2,xmm3 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp12L
psubd xmm0,xmm3 ; xmm0=tmp12H
movdqa xmm4, xmm1
movdqa xmm0, xmm2
paddd xmm1, xmm5 ; xmm1=tmp10L
paddd xmm2, xmm3 ; xmm2=tmp10H
psubd xmm4, xmm5 ; xmm4=tmp12L
psubd xmm0, xmm3 ; xmm0=tmp12H
; -- Final output stage
; -- Final output stage
; Output rows 0 and 3 = tmp10 +/- odd tmp2 (still held in xmm6/xmm7).
movdqa xmm5,xmm1
movdqa xmm3,xmm2
paddd xmm1,xmm6 ; xmm1=data0L
paddd xmm2,xmm7 ; xmm2=data0H
psubd xmm5,xmm6 ; xmm5=data3L
psubd xmm3,xmm7 ; xmm3=data3H
movdqa xmm5, xmm1
movdqa xmm3, xmm2
paddd xmm1, xmm6 ; xmm1=data0L
paddd xmm2, xmm7 ; xmm2=data0H
psubd xmm5, xmm6 ; xmm5=data3L
psubd xmm3, xmm7 ; xmm3=data3H
; Rounded descale: add 1 << (DESCALE_P1_4-1), then shift right arithmetically.
movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
paddd xmm1,xmm6
paddd xmm2,xmm6
psrad xmm1,DESCALE_P1_4
psrad xmm2,DESCALE_P1_4
paddd xmm5,xmm6
paddd xmm3,xmm6
psrad xmm5,DESCALE_P1_4
psrad xmm3,DESCALE_P1_4
paddd xmm1, xmm6
paddd xmm2, xmm6
psrad xmm1, DESCALE_P1_4
psrad xmm2, DESCALE_P1_4
paddd xmm5, xmm6
paddd xmm3, xmm6
psrad xmm5, DESCALE_P1_4
psrad xmm3, DESCALE_P1_4
; Narrow back to 16-bit words with signed saturation, rejoining L and H.
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
; Reload the spilled odd tmp0 for output rows 1 and 2.
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
; Output rows 1 and 2 = tmp12 +/- odd tmp0.
movdqa xmm2,xmm4
movdqa xmm3,xmm0
paddd xmm4,xmm7 ; xmm4=data1L
paddd xmm0,xmm6 ; xmm0=data1H
psubd xmm2,xmm7 ; xmm2=data2L
psubd xmm3,xmm6 ; xmm3=data2H
movdqa xmm2, xmm4
movdqa xmm3, xmm0
paddd xmm4, xmm7 ; xmm4=data1L
paddd xmm0, xmm6 ; xmm0=data1H
psubd xmm2, xmm7 ; xmm2=data2L
psubd xmm3, xmm6 ; xmm3=data2H
movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
paddd xmm4,xmm7
paddd xmm0,xmm7
psrad xmm4,DESCALE_P1_4
psrad xmm0,DESCALE_P1_4
paddd xmm2,xmm7
paddd xmm3,xmm7
psrad xmm2,DESCALE_P1_4
psrad xmm3,DESCALE_P1_4
paddd xmm4, xmm7
paddd xmm0, xmm7
psrad xmm4, DESCALE_P1_4
psrad xmm0, DESCALE_P1_4
paddd xmm2, xmm7
paddd xmm3, xmm7
psrad xmm2, DESCALE_P1_4
psrad xmm3, DESCALE_P1_4
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
; Transpose the 4 rows x 8 columns of words so that each register holds two
; complete 4-element columns, the layout pass 2 expects.
movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
; Both pass-1 paths join here with
;   xmm1=[col0 col1] xmm0=[col2 col3] xmm6=[col4 col5] xmm3=[col6 col7]
.column_end:
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
; Non-temporal prefetch of the DCTSIZE2*SIZEOF_JCOEF bytes that follow the
; current block, touched at 32-byte steps.
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array.
; ---- Pass 2: process rows, store into output array.
; rdi = output row pointer array, rax = output column (zero-extended r13d).
; NOTE(review): the load of [original_rbp] into rax is overwritten by
; `mov eax, r13d` (a 32-bit write clears the upper half of rax) before rax
; is read, so it looks like a dead load -- confirm before removing.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
; -- Even part
; -- Even part
; Widen col0 (low half of xmm1) to 32 bits scaled by 2^(CONST_BITS+1),
; using the same zero-unpack + psrad trick as in pass 1.
pxor xmm4,xmm4
punpcklwd xmm4,xmm1 ; xmm4=tmp0
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
pxor xmm4, xmm4
punpcklwd xmm4, xmm1 ; xmm4=tmp0
psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
; -- Odd part
; -- Odd part
; The high halves hold the odd columns: interleave col1 with col3 and col5
; with col7, then pmaddwd with the same constant pairs as in pass 1.
punpckhwd xmm1,xmm0
punpckhwd xmm6,xmm3
movdqa xmm5,xmm1
movdqa xmm2,xmm6
pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2)
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2)
pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0)
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0)
punpckhwd xmm1, xmm0
punpckhwd xmm6, xmm3
movdqa xmm5, xmm1
movdqa xmm2, xmm6
pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
paddd xmm6,xmm1 ; xmm6=tmp2
paddd xmm2,xmm5 ; xmm2=tmp0
paddd xmm6, xmm1 ; xmm6=tmp2
paddd xmm2, xmm5 ; xmm2=tmp0
; -- Even part
; -- Even part
; Low halves of xmm0/xmm3 are col2 and col6; col4 is not used.
punpcklwd xmm0,xmm3
pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2
punpcklwd xmm0, xmm3
pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
movdqa xmm7,xmm4
paddd xmm4,xmm0 ; xmm4=tmp10
psubd xmm7,xmm0 ; xmm7=tmp12
movdqa xmm7, xmm4
paddd xmm4, xmm0 ; xmm4=tmp10
psubd xmm7, xmm0 ; xmm7=tmp12
; -- Final output stage
; -- Final output stage
; xmm1 = rounding bias for the pass-2 descale.
movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
movdqa xmm5,xmm4
movdqa xmm3,xmm7
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
movdqa xmm5, xmm4
movdqa xmm3, xmm7
paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
; Rounded descale by DESCALE_P2_4.
paddd xmm4,xmm1
paddd xmm7,xmm1
psrad xmm4,DESCALE_P2_4
psrad xmm7,DESCALE_P2_4
paddd xmm5,xmm1
paddd xmm3,xmm1
psrad xmm5,DESCALE_P2_4
psrad xmm3,DESCALE_P2_4
paddd xmm4, xmm1
paddd xmm7, xmm1
psrad xmm4, DESCALE_P2_4
psrad xmm7, DESCALE_P2_4
paddd xmm5, xmm1
paddd xmm3, xmm1
psrad xmm5, DESCALE_P2_4
psrad xmm3, DESCALE_P2_4
; Narrow to words and transpose back so each output row is contiguous.
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
; Saturate to signed bytes (all 16 samples now fit in one register), then
; add CENTERJSAMPLE to every byte.
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4,[rel PB_CENTERJSAMP]
packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4, [rel PB_CENTERJSAMP]
; Rotate the dwords so that rows 1, 2 and 3 each land in the low dword of
; their own register, ready for movd.
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
; Store one dword (four samples) to each of output rows 0..3 at column rax.
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
; Epilogue: drop the scratch area, reload the pre-alignment rsp that the
; prologue stored at [rbp], then restore the caller's rbp.
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
uncollect_args
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; --------------------------------------------------------------------------
@@ -411,165 +411,165 @@ EXTN(jsimd_idct_4x4_sse2):
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
; jsimd_idct_2x2_sse2 (x86-64 build): dequantize the rows of one coefficient
; block that are actually used and write a 2x2 block of output samples.
; After collect_args (macro defined elsewhere) the code reads:
;   r10 = quantptr (multiplier table)   r11 = inptr (coefficient block)
;   r12 = output_buf (row pointers)     r13d = output_col
; Only input rows 0,1,3,5,7 are loaded and only columns 0,1,3,5,7 contribute
; to the result (the "**" cells in the diagrams below are ignored).
align 16
global EXTN(jsimd_idct_2x2_sse2)
align 16
global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2):
; Prologue. rax receives the frame address before collect_args runs;
; presumably the macro uses it -- confirm against its definition.
; rbx is saved because bx is used as scratch for the final stores.
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbx
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
push rbx
; ---- Pass 1: process columns from input.
; ---- Pass 1: process columns from input.
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr
; | input: | result: |
; | 00 01 ** 03 ** 05 ** 07 | |
; | 10 11 ** 13 ** 15 ** 17 | |
; | ** ** ** ** ** ** ** ** | |
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
; | 50 51 ** 53 ** 55 ** 57 | |
; | ** ** ** ** ** ** ** ** | |
; | 70 71 ** 73 ** 75 ** 77 | |
; | input: | result: |
; | 00 01 ** 03 ** 05 ** 07 | |
; | 10 11 ** 13 ** 15 ** 17 | |
; | ** ** ** ** ** ** ** ** | |
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
; | 50 51 ** 53 ** 55 ** 57 | |
; | ** ** ** ** ** ** ** ** | |
; | 70 71 ** 73 ** 75 ** 77 | |
; -- Odd part
; -- Odd part
; Load rows 1,3,5,7 and dequantize them (word-wise multiply by the table).
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
; Build a mask that keeps only the upper word of every dword.
pcmpeqd xmm7,xmm7
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
pcmpeqd xmm7, xmm7
pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
; Interleave rows (1,3) and (5,7) so that each dword holds one word pair for
; pmaddwd; only the column-0 dword of this result is consumed later.
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4,[rel PW_F362_MF127]
pmaddwd xmm5,[rel PW_F085_MF072]
movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4, [rel PW_F362_MF127]
pmaddwd xmm5, [rel PW_F085_MF072]
; Odd columns 1,3,5,7: move rows 1/5 into the low word of each dword, keep
; rows 3/7 in the high word, and OR them into the same pair layout.
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0,[rel PW_F362_MF127]
pmaddwd xmm2,[rel PW_F085_MF072]
psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0, [rel PW_F362_MF127]
pmaddwd xmm2, [rel PW_F085_MF072]
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
; -- Even part
; -- Even part
; Row 0 is dequantized, placed in the high word of each dword and shifted
; right arithmetically, which leaves it sign-extended and scaled by
; 2^(CONST_BITS+2) (assumes WORD_BIT is 16 -- TODO confirm).
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm6=(00 01 ** 03 ** 05 ** 07)
; xmm6=(00 01 ** 03 ** 05 ** 07)
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
; -- Final output stage
; -- Final output stage
; data0 = tmp10 + tmp0 (the A row), data1 = tmp10 - tmp0 (the B row).
movdqa xmm3,xmm6
movdqa xmm5,xmm1
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm3, xmm6
movdqa xmm5, xmm1
paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
; Regroup A/B values, then round (add PD_DESCALE_P1_2) and shift right by
; DESCALE_P1_2.
movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
movdqa xmm7,xmm1
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
movdqa xmm7, xmm1
punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
paddd xmm6,xmm2
psrad xmm6,DESCALE_P1_2
paddd xmm6, xmm2
psrad xmm6, DESCALE_P1_2
paddd xmm1,xmm2
paddd xmm7,xmm2
psrad xmm1,DESCALE_P1_2
psrad xmm7,DESCALE_P1_2
paddd xmm1, xmm2
paddd xmm7, xmm2
psrad xmm1, DESCALE_P1_2
psrad xmm7, DESCALE_P1_2
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
; Four prefetches 32 bytes apart, starting DCTSIZE2*SIZEOF_JCOEF bytes past
; the current block.
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array.
; ---- Pass 2: process rows, store into output array.
; The 32-bit write to eax zero-extends output_col into rax, which is used
; as the column index in the stores at the end.
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
; | input:| result:|
; | A0 B0 | |
; | A1 B1 | C0 C1 |
; | A3 B3 | D0 D1 |
; | A5 B5 | |
; | A7 B7 | |
; | input:| result:|
; | A0 B0 | |
; | A1 B1 | C0 C1 |
; | A3 B3 | D0 D1 |
; | A5 B5 | |
; | A7 B7 | |
; -- Odd part
; -- Odd part
; packssdw narrows the dwords to words (signed saturation) so that pmaddwd
; can again consume (x1,x3) and (x5,x7) word pairs.
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1,[rel PW_F362_MF127]
pmaddwd xmm7,[rel PW_F085_MF072]
packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1, [rel PW_F362_MF127]
pmaddwd xmm7, [rel PW_F085_MF072]
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
; -- Even part
; -- Even part
; A0/B0 are scaled up by 2^(CONST_BITS+2), the same scale applied to row 0
; in pass 1.
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
; -- Final output stage
; -- Final output stage
movdqa xmm4,xmm6
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
movdqa xmm4, xmm6
paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
; Round and shift right by DESCALE_P2_2.
paddd xmm6,[rel PD_DESCALE_P2_2]
psrad xmm6,DESCALE_P2_2
paddd xmm6, [rel PD_DESCALE_P2_2]
psrad xmm6, DESCALE_P2_2
; Narrow dword -> word -> byte with signed saturation, then add
; PB_CENTERJSAMP to every byte.
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6,[rel PB_CENTERJSAMP]
packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6, [rel PB_CENTERJSAMP]
; Each extracted word carries two adjacent output bytes.
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
; Fetch the pointers for output rows 0 and 1 and store one WORD into each
; at offset output_col.
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
; Epilogue: undo the prologue in reverse order.
pop rbx
uncollect_args
pop rbp
ret
pop rbx
uncollect_args
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -25,74 +25,74 @@
; --------------------------------------------------------------------------
; Fixed-point parameters for the reduced-size IDCT routines in this file.
; CONST_BITS is the number of fractional bits in the F_x_xxx multipliers;
; PASS1_BITS is the extra scaling left on the data after pass 1.
%define CONST_BITS 13
%define PASS1_BITS 2
%define CONST_BITS 13
%define PASS1_BITS 2
; Right-shift counts used to descale results:
;   DESCALE_P1_4 / DESCALE_P2_4 -- pass 1 / pass 2 of the 4x4 routine
;   DESCALE_P1_2 / DESCALE_P2_2 -- pass 1 / pass 2 of the 2x2 routine
; The matching rounding constants (1 << (n-1)) are the PD_DESCALE_* tables.
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
; Fixed-point multipliers. Each F_a_bcd is the real value shown in its
; FIX(...) comment multiplied by 2^CONST_BITS and rounded to an integer.
; For the default CONST_BITS of 13 the values are written out literally
; (e.g. 0.211164243 * 8192 rounds to 1730).
%if CONST_BITS == 13
F_0_211 equ 1730 ; FIX(0.211164243)
F_0_509 equ 4176 ; FIX(0.509795579)
F_0_601 equ 4926 ; FIX(0.601344887)
F_0_720 equ 5906 ; FIX(0.720959822)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_850 equ 6967 ; FIX(0.850430095)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_061 equ 8697 ; FIX(1.061594337)
F_1_272 equ 10426 ; FIX(1.272758580)
F_1_451 equ 11893 ; FIX(1.451774981)
F_1_847 equ 15137 ; FIX(1.847759065)
F_2_172 equ 17799 ; FIX(2.172734803)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_624 equ 29692 ; FIX(3.624509785)
F_0_211 equ 1730 ; FIX(0.211164243)
F_0_509 equ 4176 ; FIX(0.509795579)
F_0_601 equ 4926 ; FIX(0.601344887)
F_0_720 equ 5906 ; FIX(0.720959822)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_850 equ 6967 ; FIX(0.850430095)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_061 equ 8697 ; FIX(1.061594337)
F_1_272 equ 10426 ; FIX(1.272758580)
F_1_451 equ 11893 ; FIX(1.451774981)
F_1_847 equ 15137 ; FIX(1.847759065)
F_2_172 equ 17799 ; FIX(2.172734803)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_624 equ 29692 ; FIX(3.624509785)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; For any other CONST_BITS the multipliers are given pre-scaled by 2^30 as
; integers; DESCALE(x,n) adds half of 2^n and shifts right by n, i.e. it
; rounds x / 2^n to the nearest integer, bringing them to 2^CONST_BITS scale.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif
; --------------------------------------------------------------------------
; Constant tables for the reduced-size IDCT routines. Every entry is one
; 16-byte vector; the block starts on, and is padded back to, a 16-byte
; boundary.
SECTION SEG_CONST
SECTION SEG_CONST
alignz 16
global EXTN(jconst_idct_red_sse2)
alignz 16
global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2):
; PW_*  : one (a,b) word pair repeated 4 times, used as the pmaddwd operand
;         so that each dword lane computes x*a + y*b for an interleaved
;         (x,y) word pair. "MF" in a name marks a negated multiplier.
; PD_DESCALE_* : rounding term 1 << (n-1) in each of 4 dwords, added before
;         the arithmetic right shift by n = DESCALE_*.
; PB_CENTERJSAMP : CENTERJSAMPLE in each of 16 bytes, added to the packed
;         output bytes.
PW_F184_MF076 times 4 dw F_1_847,-F_0_765
PW_F256_F089 times 4 dw F_2_562, F_0_899
PW_F106_MF217 times 4 dw F_1_061,-F_2_172
PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
PW_F145_MF021 times 4 dw F_1_451,-F_0_211
PW_F362_MF127 times 4 dw F_3_624,-F_1_272
PW_F085_MF072 times 4 dw F_0_850,-F_0_720
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
PW_F184_MF076 times 4 dw F_1_847,-F_0_765
PW_F256_F089 times 4 dw F_2_562, F_0_899
PW_F106_MF217 times 4 dw F_1_061,-F_2_172
PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
PW_F145_MF021 times 4 dw F_1_451,-F_0_211
PW_F362_MF127 times 4 dw F_3_624,-F_1_272
PW_F085_MF072 times 4 dw F_0_850,-F_0_720
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
@@ -102,309 +102,309 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Stack-argument accessors for the 32-bit routine. (b) is the address where
; the caller's ebp was pushed, so +4 is the return address and the four
; arguments follow from +8.
%define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
%define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
; original_ebp: slot at the aligned frame base holding the pre-alignment
; stack pointer. wk(i): i-th of WK_NUM xmmword scratch slots located just
; below the aligned ebp.
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_idct_4x4_sse2)
align 16
global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2):
; Prologue: remember the unaligned frame address in eax, round esp down to
; an xmmword boundary, stash the unaligned address at the new frame base,
; then drop esp below the wk[] scratch area before saving registers.
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input.
; ---- Pass 1: process columns from input.
; The reload of eax is commented out: eax presumably still holds the frame
; address from the prologue (assumes pushpic/get_GOT leave eax alone --
; TODO confirm against the macro definitions).
; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
; Shortcut for blocks whose AC coefficients are all zero.
; Cheap first check: one dword from each of rows 1 and 2. eax is clobbered
; here and reloaded from [original_ebp] in pass 2.
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
; Full check: OR rows 1,2,3,5,6,7 together (row 4 is not tested; it is
; never loaded by .columnDCT either). The two packsswb steps fold the
; 8 words into the low 4 bytes, which are nonzero iff any word was nonzero.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm0,xmm1
packsswb xmm0,xmm0
packsswb xmm0,xmm0
movd eax,xmm0
test eax,eax
jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm0, xmm1
packsswb xmm0, xmm0
packsswb xmm0, xmm0
movd eax, xmm0
test eax, eax
jnz short .columnDCT
; -- AC terms all zero
; -- AC terms all zero
; Dequantize row 0 only and scale it by 2^PASS1_BITS.
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm0,PASS1_BITS
psllw xmm0, PASS1_BITS
; Replicate each column's value four times so that xmm1/xmm0/xmm6/xmm3
; end up in the same [colN colN+1] layout that .columnDCT produces.
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
; Skip the full column transform.
jmp near .column_end
alignx 16,7
jmp near .column_end
alignx 16, 7
%endif
; Pass 1 general case: full column transform of the dequantized block.
; Reads esi (coefficients), edx (multiplier table) and ebx (GOT base for
; the constant tables). Leaves the transposed result in
; xmm1=[col0 col1], xmm0=[col2 col3], xmm6=[col4 col5], xmm3=[col6 col7]
; and falls through into .column_end. Row 4 of the input is never loaded.
.columnDCT:
; -- Odd part
; -- Odd part
; Dequantize rows 1,3,5,7.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
; Interleave rows 1 and 3 into (row1,row3) word pairs (low half "L", high
; half "H") and form the partial sums for tmp2 and tmp0 with pmaddwd.
movdqa xmm4,xmm0
movdqa xmm5,xmm0
punpcklwd xmm4,xmm1
punpckhwd xmm5,xmm1
movdqa xmm0,xmm4
movdqa xmm1,xmm5
pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
movdqa xmm4, xmm0
movdqa xmm5, xmm0
punpcklwd xmm4, xmm1
punpckhwd xmm5, xmm1
movdqa xmm0, xmm4
movdqa xmm1, xmm5
pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
; Same for the (row5,row7) pairs.
movdqa xmm6,xmm2
movdqa xmm7,xmm2
punpcklwd xmm6,xmm3
punpckhwd xmm7,xmm3
movdqa xmm2,xmm6
movdqa xmm3,xmm7
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
movdqa xmm6, xmm2
movdqa xmm7, xmm2
punpcklwd xmm6, xmm3
punpckhwd xmm7, xmm3
movdqa xmm2, xmm6
movdqa xmm3, xmm7
pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
; Combine the two partial sums.
paddd xmm6,xmm4 ; xmm6=tmp2L
paddd xmm7,xmm5 ; xmm7=tmp2H
paddd xmm2,xmm0 ; xmm2=tmp0L
paddd xmm3,xmm1 ; xmm3=tmp0H
paddd xmm6, xmm4 ; xmm6=tmp2L
paddd xmm7, xmm5 ; xmm7=tmp2H
paddd xmm2, xmm0 ; xmm2=tmp0L
paddd xmm3, xmm1 ; xmm3=tmp0H
; Spill tmp0 to the stack scratch slots; xmm2/xmm3 are reused below and
; tmp0 is reloaded in the final output stage.
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
; -- Even part
; -- Even part
; Dequantize rows 0, 2 and 6.
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
; Unpacking row 0 against zero puts each word in the high half of a dword;
; the arithmetic right shift then yields the sign-extended value scaled by
; 2^(CONST_BITS+1).
pxor xmm1,xmm1
pxor xmm2,xmm2
punpcklwd xmm1,xmm4 ; xmm1=tmp0L
punpckhwd xmm2,xmm4 ; xmm2=tmp0H
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
pxor xmm1, xmm1
pxor xmm2, xmm2
punpcklwd xmm1, xmm4 ; xmm1=tmp0L
punpckhwd xmm2, xmm4 ; xmm2=tmp0H
psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
; (row2,row6) pairs times (F_1_847,-F_0_765) give the even-part tmp2.
movdqa xmm3,xmm5 ; xmm5=in2=z2
punpcklwd xmm5,xmm0 ; xmm0=in6=z3
punpckhwd xmm3,xmm0
pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
movdqa xmm3, xmm5 ; xmm5=in2=z2
punpcklwd xmm5, xmm0 ; xmm0=in6=z3
punpckhwd xmm3, xmm0
pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
; tmp10 = tmp0 + tmp2, tmp12 = tmp0 - tmp2.
movdqa xmm4,xmm1
movdqa xmm0,xmm2
paddd xmm1,xmm5 ; xmm1=tmp10L
paddd xmm2,xmm3 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp12L
psubd xmm0,xmm3 ; xmm0=tmp12H
movdqa xmm4, xmm1
movdqa xmm0, xmm2
paddd xmm1, xmm5 ; xmm1=tmp10L
paddd xmm2, xmm3 ; xmm2=tmp10H
psubd xmm4, xmm5 ; xmm4=tmp12L
psubd xmm0, xmm3 ; xmm0=tmp12H
; -- Final output stage
; -- Final output stage
; Rows 0 and 3: tmp10 +/- odd-part tmp2 (held in xmm6/xmm7).
movdqa xmm5,xmm1
movdqa xmm3,xmm2
paddd xmm1,xmm6 ; xmm1=data0L
paddd xmm2,xmm7 ; xmm2=data0H
psubd xmm5,xmm6 ; xmm5=data3L
psubd xmm3,xmm7 ; xmm3=data3H
movdqa xmm5, xmm1
movdqa xmm3, xmm2
paddd xmm1, xmm6 ; xmm1=data0L
paddd xmm2, xmm7 ; xmm2=data0H
psubd xmm5, xmm6 ; xmm5=data3L
psubd xmm3, xmm7 ; xmm3=data3H
; Round, shift right by DESCALE_P1_4 and pack to words (signed saturation).
movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
paddd xmm1,xmm6
paddd xmm2,xmm6
psrad xmm1,DESCALE_P1_4
psrad xmm2,DESCALE_P1_4
paddd xmm5,xmm6
paddd xmm3,xmm6
psrad xmm5,DESCALE_P1_4
psrad xmm3,DESCALE_P1_4
paddd xmm1, xmm6
paddd xmm2, xmm6
psrad xmm1, DESCALE_P1_4
psrad xmm2, DESCALE_P1_4
paddd xmm5, xmm6
paddd xmm3, xmm6
psrad xmm5, DESCALE_P1_4
psrad xmm3, DESCALE_P1_4
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
; Rows 1 and 2: tmp12 +/- odd-part tmp0 (reloaded from the scratch slots).
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm2,xmm4
movdqa xmm3,xmm0
paddd xmm4,xmm7 ; xmm4=data1L
paddd xmm0,xmm6 ; xmm0=data1H
psubd xmm2,xmm7 ; xmm2=data2L
psubd xmm3,xmm6 ; xmm3=data2H
movdqa xmm2, xmm4
movdqa xmm3, xmm0
paddd xmm4, xmm7 ; xmm4=data1L
paddd xmm0, xmm6 ; xmm0=data1H
psubd xmm2, xmm7 ; xmm2=data2L
psubd xmm3, xmm6 ; xmm3=data2H
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
paddd xmm4,xmm7
paddd xmm0,xmm7
psrad xmm4,DESCALE_P1_4
psrad xmm0,DESCALE_P1_4
paddd xmm2,xmm7
paddd xmm3,xmm7
psrad xmm2,DESCALE_P1_4
psrad xmm3,DESCALE_P1_4
paddd xmm4, xmm7
paddd xmm0, xmm7
psrad xmm4, DESCALE_P1_4
psrad xmm0, DESCALE_P1_4
paddd xmm2, xmm7
paddd xmm3, xmm7
psrad xmm2, DESCALE_P1_4
psrad xmm3, DESCALE_P1_4
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
; Transpose the 4 rows x 8 columns of words so that each register holds
; two complete 4-element columns.
movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
; Join point of the all-zero-AC shortcut and .columnDCT. On entry:
; xmm1=[col0 col1], xmm0=[col2 col3], xmm6=[col4 col5], xmm3=[col6 col7],
; esi = inptr, ebx = GOT base. Runs pass 2, stores 4 samples into each of
; 4 output rows, and returns. Column 4 (low half of xmm6) is not used.
.column_end:
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array.
; ---- Pass 2: process rows, store into output array.
; Reload the unaligned frame address saved by the prologue, then fetch the
; remaining two stack arguments; eax ends up holding output_col.
mov eax, [original_ebp]
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)]
mov eax, [original_ebp]
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)]
; -- Even part
; -- Even part
; col0 words go to the high half of each dword; the arithmetic shift leaves
; them sign-extended and scaled by 2^(CONST_BITS+1).
pxor xmm4,xmm4
punpcklwd xmm4,xmm1 ; xmm4=tmp0
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
pxor xmm4, xmm4
punpcklwd xmm4, xmm1 ; xmm4=tmp0
psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
; -- Odd part
; -- Odd part
; The high-half unpacks pair col1 with col3 (xmm1) and col5 with col7
; (xmm6) for pmaddwd.
punpckhwd xmm1,xmm0
punpckhwd xmm6,xmm3
movdqa xmm5,xmm1
movdqa xmm2,xmm6
pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
punpckhwd xmm1, xmm0
punpckhwd xmm6, xmm3
movdqa xmm5, xmm1
movdqa xmm2, xmm6
pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
paddd xmm6,xmm1 ; xmm6=tmp2
paddd xmm2,xmm5 ; xmm2=tmp0
paddd xmm6, xmm1 ; xmm6=tmp2
paddd xmm2, xmm5 ; xmm2=tmp0
; -- Even part
; -- Even part
; The low-half unpack pairs col2 with col6.
punpcklwd xmm0,xmm3
pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
punpcklwd xmm0, xmm3
pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
movdqa xmm7,xmm4
paddd xmm4,xmm0 ; xmm4=tmp10
psubd xmm7,xmm0 ; xmm7=tmp12
movdqa xmm7, xmm4
paddd xmm4, xmm0 ; xmm4=tmp10
psubd xmm7, xmm0 ; xmm7=tmp12
; -- Final output stage
; -- Final output stage
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
; Butterflies: data0/data3 = tmp10 +/- tmp2, data1/data2 = tmp12 +/- tmp0.
movdqa xmm5,xmm4
movdqa xmm3,xmm7
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
movdqa xmm5, xmm4
movdqa xmm3, xmm7
paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
; Round and shift right by DESCALE_P2_4.
paddd xmm4,xmm1
paddd xmm7,xmm1
psrad xmm4,DESCALE_P2_4
psrad xmm7,DESCALE_P2_4
paddd xmm5,xmm1
paddd xmm3,xmm1
psrad xmm5,DESCALE_P2_4
psrad xmm3,DESCALE_P2_4
paddd xmm4, xmm1
paddd xmm7, xmm1
psrad xmm4, DESCALE_P2_4
psrad xmm7, DESCALE_P2_4
paddd xmm5, xmm1
paddd xmm3, xmm1
psrad xmm5, DESCALE_P2_4
psrad xmm3, DESCALE_P2_4
; Narrow to words, then transpose back so that each dword group holds the
; four samples of one output row.
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
; Pack to bytes with signed saturation and add PB_CENTERJSAMP to each byte.
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
; Rotate the four dwords so that rows 1, 2 and 3 each land in the low dword
; of their own register, ready for movd.
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
; Store one dword (four samples) into each of output rows 0..3 at offset
; output_col (eax).
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
; Epilogue: restore saved registers, then unwind the aligned frame. The
; "pop esp" reloads the unaligned stack pointer that the prologue stored at
; the aligned frame base.
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
@@ -417,177 +417,177 @@ EXTN(jsimd_idct_4x4_sse2):
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
%define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
%define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
; --------------------------------------------------------------------------
; jsimd_idct_2x2_sse2 (32-bit build)
;
; Reduced-size inverse DCT: dequantizes rows 0, 1, 3, 5 and 7 of the
; coefficient block and writes two samples into each of two output rows.
; Arguments are read from the stack frame through ebp (dct_table, coef_block,
; output_buf, output_col).  ebx, esi and edi are saved and restored; ecx and
; edx are used as scratch and are not preserved.
;
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.  The two copies
; differ only in whitespace.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_idct_2x2_sse2)
align 16
global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2):
push ebp
mov ebp,esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; ebx serves as the base for every GOTOFF(ebx,...) constant reference below.
get_GOT ebx ; get GOT address
get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input.
; ---- Pass 1: process columns from input.
mov edx, POINTER [dct_table(ebp)] ; quantptr
mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
mov edx, POINTER [dct_table(ebp)] ; quantptr
mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
; | input: | result: |
; | 00 01 ** 03 ** 05 ** 07 | |
; | 10 11 ** 13 ** 15 ** 17 | |
; | ** ** ** ** ** ** ** ** | |
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
; | 50 51 ** 53 ** 55 ** 57 | |
; | ** ** ** ** ** ** ** ** | |
; | 70 71 ** 73 ** 75 ** 77 | |
; | input: | result: |
; | 00 01 ** 03 ** 05 ** 07 | |
; | 10 11 ** 13 ** 15 ** 17 | |
; | ** ** ** ** ** ** ** ** | |
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
; | 50 51 ** 53 ** 55 ** 57 | |
; | ** ** ** ** ** ** ** ** | |
; | 70 71 ** 73 ** 75 ** 77 | |
; -- Odd part
; -- Odd part
; Load rows 1, 3, 5, 7 and dequantize each with the matching quant-table row.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
; Build a mask that keeps only the high word of every dword; it is used to
; pick the odd-numbered columns out of a row.
pcmpeqd xmm7,xmm7
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
pcmpeqd xmm7, xmm7
pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
; Columns 0/1 path: interleave row pairs so that pmaddwd forms, per column,
; row1*c0 + row3*c1 and row5*c2 + row7*c3 from the two constant tables.
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)]
movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
; Columns 1/3/5/7 path: pair the odd columns of two rows inside each dword
; (shift one row down, mask the other, OR them) and apply the same constants.
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
; -- Even part
; -- Even part
; Only row 0 contributes to the even part.
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm6=(00 01 ** 03 ** 05 ** 07)
; xmm6=(00 01 ** 03 ** 05 ** 07)
; Each 16-bit value is first placed in the high word of its dword, then
; shifted right arithmetically by WORD_BIT-CONST_BITS-2; the net effect is
; the sign-extended value scaled up by CONST_BITS+2 bits.
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
; -- Final output stage
; -- Final output stage
; data0 = tmp10 + tmp0 (row A), data1 = tmp10 - tmp0 (row B).
movdqa xmm3,xmm6
movdqa xmm5,xmm1
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm3, xmm6
movdqa xmm5, xmm1
paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
movdqa xmm7,xmm1
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
movdqa xmm7, xmm1
punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
; Descale pass-1 results: add the PD_DESCALE_P1_2 constant, then shift right.
paddd xmm6,xmm2
psrad xmm6,DESCALE_P1_2
paddd xmm6, xmm2
psrad xmm6, DESCALE_P1_2
paddd xmm1,xmm2
paddd xmm7,xmm2
psrad xmm1,DESCALE_P1_2
psrad xmm7,DESCALE_P1_2
paddd xmm1, xmm2
paddd xmm7, xmm2
psrad xmm1, DESCALE_P1_2
psrad xmm7, DESCALE_P1_2
; -- Prefetch the next coefficient block
; -- Prefetch the next coefficient block
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array.
; ---- Pass 2: process rows, store into output array.
mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(ebp)]
mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(ebp)]
; | input:| result:|
; | A0 B0 | |
; | A1 B1 | C0 C1 |
; | A3 B3 | D0 D1 |
; | A5 B5 | |
; | A7 B7 | |
; | input:| result:|
; | A0 B0 | |
; | A1 B1 | C0 C1 |
; | A3 B3 | D0 D1 |
; | A5 B5 | |
; | A7 B7 | |
; -- Odd part
; -- Odd part
; Narrow the 32-bit pass-1 results to 16 bits (with signed saturation) so the
; same pmaddwd constant tables can be reused for the row pass.
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)]
packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
; -- Even part
; -- Even part
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
; -- Final output stage
; -- Final output stage
movdqa xmm4,xmm6
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
movdqa xmm4, xmm6
paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
psrad xmm6,DESCALE_P2_2
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
psrad xmm6, DESCALE_P2_2
; Saturate down to bytes, then add the PB_CENTERJSAMP constant to every byte.
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
; The paddb above is the last GOTOFF(ebx,...) reference, so ebx can now be
; reused as a scratch register for the extracted sample pair.
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
; One 16-bit store per output row: word 0 goes to the row at output_buf[0],
; word 1 to the row at output_buf[1], both at offset output_col.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -20,8 +20,8 @@
%include "jdct.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Load data into workspace, applying unsigned->signed conversion
;
@@ -34,65 +34,65 @@
; r11 = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace
; --------------------------------------------------------------------------
; jsimd_convsamp_float_sse2 (64-bit build)
;
; Reads 8 samples from each of DCTSIZE rows, subtracts the 0x80 centre value
; from every sample, converts the signed result to single-precision float and
; stores it to the workspace.  Two rows are handled per loop iteration.
;
; Register use after collect_args: r10 -> rsi (array of row pointers),
; r11d -> eax (start column), r12 -> rdi (output workspace).
; rbx is saved and restored around the loop.
;
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_convsamp_float_sse2)
align 16
global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2):
; NOTE(review): rax is loaded with rsp here but is overwritten before any
; visible use in this routine; presumably the collect_args macro (defined
; elsewhere) reads it -- confirm against the macro definition.
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbx
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
push rbx
; Build the centre constant in-register: all-ones words -> 0xFF80 per word
; -> saturating pack to bytes gives 0x80 in every byte.
pcmpeqw xmm7,xmm7
psllw xmm7,7
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
pcmpeqw xmm7, xmm7
psllw xmm7, 7
packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
; Writing eax zero-extends into rax, so rax can be used as a 64-bit index.
; rcx counts loop iterations (two rows each).
mov rsi, r10
mov eax, r11d
mov rdi, r12
mov rcx, DCTSIZE/2
mov rsi, r10
mov eax, r11d
mov rdi, r12
mov rcx, DCTSIZE/2
.convloop:
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
psubb xmm0,xmm7 ; xmm0=(01234567)
psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
psubb xmm0, xmm7 ; xmm0=(01234567)
psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
; Widen bytes to dwords by unpacking so that each sample ends up in the top
; byte of its dword ('*' marks don't-care bytes).  xmm2/xmm3 are used as
; unpack destinations without being initialised: their old contents land in
; the low word of each dword and are discarded by the psrad below.
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
; Arithmetic shift brings the top byte down with sign extension, then the
; signed dwords are converted to floats.
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
; movaps requires a 16-byte-aligned destination.
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
; Advance by two input row pointers and two output rows.
add rsi, byte 2*SIZEOF_JSAMPROW
add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec rcx
jnz short .convloop
add rsi, byte 2*SIZEOF_JSAMPROW
add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec rcx
jnz short .convloop
pop rbx
uncollect_args
pop rbp
ret
pop rbx
uncollect_args
pop rbp
ret
; --------------------------------------------------------------------------
@@ -108,50 +108,50 @@ EXTN(jsimd_convsamp_float_sse2):
; r11 = FAST_FLOAT *divisors
; r12 = FAST_FLOAT *workspace
; --------------------------------------------------------------------------
; jsimd_quantize_float_sse2 (64-bit build)
;
; Multiplies each workspace float by the matching entry of the divisors
; table, converts the product to a 32-bit integer, narrows it to 16 bits with
; signed saturation and stores it to the coefficient block.  Sixteen values
; are processed per loop iteration.
;
; Register use after collect_args: r12 -> rsi (workspace, input),
; r11 -> rdx (divisors), r10 -> rdi (coef_block, output).
;
; NOTE(review): the table is named "divisors" but is applied with mulps, so
; it presumably holds reciprocals -- confirm against the code that fills it.
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_quantize_float_sse2)
align 16
global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2):
; NOTE(review): rax is loaded with rsp here and later reused as the loop
; counter; presumably collect_args (defined elsewhere) reads it -- confirm.
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
; rax = number of loop iterations (16 values each).
mov rsi, r12
mov rdx, r11
mov rdi, r10
mov rax, DCTSIZE2/16
mov rsi, r12
mov rdx, r11
mov rdi, r10
mov rax, DCTSIZE2/16
.quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
; cvtps2dq rounds according to the current MXCSR rounding mode.
cvtps2dq xmm0,xmm0
cvtps2dq xmm1,xmm1
cvtps2dq xmm2,xmm2
cvtps2dq xmm3,xmm3
cvtps2dq xmm0, xmm0
cvtps2dq xmm1, xmm1
cvtps2dq xmm2, xmm2
cvtps2dq xmm3, xmm3
; Narrow 2 x 4 dwords into 8 words, saturating to the signed 16-bit range.
packssdw xmm0,xmm1
packssdw xmm2,xmm3
packssdw xmm0, xmm1
packssdw xmm2, xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
; Advance all three pointers by the 16 elements just processed.
add rsi, byte 16*SIZEOF_FAST_FLOAT
add rdx, byte 16*SIZEOF_FAST_FLOAT
add rdi, byte 16*SIZEOF_JCOEF
dec rax
jnz short .quantloop
add rsi, byte 16*SIZEOF_FAST_FLOAT
add rdx, byte 16*SIZEOF_FAST_FLOAT
add rdi, byte 16*SIZEOF_JCOEF
dec rax
jnz short .quantloop
uncollect_args
pop rbp
ret
uncollect_args
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -19,8 +19,8 @@
%include "jdct.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Load data into workspace, applying unsigned->signed conversion
;
@@ -29,75 +29,75 @@
; FAST_FLOAT *workspace);
;
%define sample_data ebp+8 ; JSAMPARRAY sample_data
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; FAST_FLOAT *workspace
%define sample_data ebp+8 ; JSAMPARRAY sample_data
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; FAST_FLOAT *workspace
; --------------------------------------------------------------------------
; jsimd_convsamp_float_sse2 (32-bit build)
;
; Reads 8 samples from each of DCTSIZE rows, subtracts the 0x80 centre value
; from every sample, converts the signed result to single-precision float and
; stores it to the workspace.  Two rows are handled per loop iteration.
;
; Arguments are read from the stack frame through ebp (sample_data,
; start_col, workspace).  ebx, esi and edi are saved and restored; ecx and
; edx are used as scratch and are not preserved.
;
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_convsamp_float_sse2)
align 16
global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2):
push ebp
mov ebp,esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; Build the centre constant in-register: all-ones words -> 0xFF80 per word
; -> saturating pack to bytes gives 0x80 in every byte.
pcmpeqw xmm7,xmm7
psllw xmm7,7
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
pcmpeqw xmm7, xmm7
psllw xmm7, 7
packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
; ecx counts loop iterations (two rows each).
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/2
alignx 16,7
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/2
alignx 16, 7
.convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
psubb xmm0,xmm7 ; xmm0=(01234567)
psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
psubb xmm0, xmm7 ; xmm0=(01234567)
psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
; Widen bytes to dwords by unpacking so that each sample ends up in the top
; byte of its dword ('*' marks don't-care bytes).  xmm2/xmm3 are used as
; unpack destinations without being initialised: their old contents land in
; the low word of each dword and are discarded by the psrad below.
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
; Arithmetic shift brings the top byte down with sign extension, then the
; signed dwords are converted to floats.
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
; movaps requires a 16-byte-aligned destination.
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
; Advance by two input row pointers and two output rows.
add esi, byte 2*SIZEOF_JSAMPROW
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx
jnz short .convloop
add esi, byte 2*SIZEOF_JSAMPROW
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx
jnz short .convloop
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; --------------------------------------------------------------------------
@@ -109,62 +109,62 @@ EXTN(jsimd_convsamp_float_sse2):
; FAST_FLOAT *workspace);
;
%define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; FAST_FLOAT *divisors
%define workspace ebp+16 ; FAST_FLOAT *workspace
%define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; FAST_FLOAT *divisors
%define workspace ebp+16 ; FAST_FLOAT *workspace
; --------------------------------------------------------------------------
; jsimd_quantize_float_sse2 (32-bit build)
;
; Multiplies each workspace float by the matching entry of the divisors
; table, converts the product to a 32-bit integer, narrows it to 16 bits with
; signed saturation and stores it to the coefficient block.  Sixteen values
; are processed per loop iteration.
;
; Arguments are read from the stack frame through ebp (coef_block, divisors,
; workspace).  Only esi and edi are saved; eax and edx are used as scratch,
; and ebx/ecx are not touched.
;
; NOTE(review): the table is named "divisors" but is applied with mulps, so
; it presumably holds reciprocals -- confirm against the code that fills it.
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_quantize_float_sse2)
align 16
global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2):
push ebp
mov ebp,esp
; push ebx ; unused
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
; eax = number of loop iterations (16 values each).
mov esi, POINTER [workspace]
mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16
alignx 16,7
mov esi, POINTER [workspace]
mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16
alignx 16, 7
.quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
; cvtps2dq rounds according to the current MXCSR rounding mode.
cvtps2dq xmm0,xmm0
cvtps2dq xmm1,xmm1
cvtps2dq xmm2,xmm2
cvtps2dq xmm3,xmm3
cvtps2dq xmm0, xmm0
cvtps2dq xmm1, xmm1
cvtps2dq xmm2, xmm2
cvtps2dq xmm3, xmm3
; Narrow 2 x 4 dwords into 8 words, saturating to the signed 16-bit range.
packssdw xmm0,xmm1
packssdw xmm2,xmm3
packssdw xmm0, xmm1
packssdw xmm2, xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
; Advance all three pointers by the 16 elements just processed.
add esi, byte 16*SIZEOF_FAST_FLOAT
add edx, byte 16*SIZEOF_FAST_FLOAT
add edi, byte 16*SIZEOF_JCOEF
dec eax
jnz short .quantloop
add esi, byte 16*SIZEOF_FAST_FLOAT
add edx, byte 16*SIZEOF_FAST_FLOAT
add edi, byte 16*SIZEOF_JCOEF
dec eax
jnz short .quantloop
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
; pop ebx ; unused
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
; pop ebx ; unused
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -20,8 +20,8 @@
%include "jdct.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
SECTION SEG_TEXT
BITS 64
;
; Load data into workspace, applying unsigned->signed conversion
;
@@ -34,60 +34,60 @@
; r11 = JDIMENSION start_col
; r12 = DCTELEM *workspace
; --------------------------------------------------------------------------
; jsimd_convsamp_sse2 (64-bit build)
;
; Reads 8 samples from each of DCTSIZE rows, zero-extends every byte to a
; 16-bit word, adds 0xFF80 (i.e. subtracts 128) and stores the words to the
; workspace.  Four rows are handled per loop iteration.
;
; Register use after collect_args: r10 -> rsi (array of row pointers),
; r11d -> eax (start column), r12 -> rdi (output workspace).
; rbx is saved and restored around the loop.
;
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_convsamp_sse2)
align 16
global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2):
; NOTE(review): rax is loaded with rsp here but is overwritten before any
; visible use in this routine; presumably the collect_args macro (defined
; elsewhere) reads it -- confirm against the macro definition.
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbx
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
push rbx
; xmm6 supplies the zero high bytes for the byte->word unpack;
; xmm7 is the per-word bias added after widening.
pxor xmm6,xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7,xmm7
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
pxor xmm6, xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7, xmm7
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
; Writing eax zero-extends into rax, so rax can be used as a 64-bit index.
; rcx counts loop iterations (four rows each).
mov rsi, r10
mov eax, r11d
mov rdi, r12
mov rcx, DCTSIZE/4
mov rsi, r10
mov eax, r11d
mov rdi, r12
mov rcx, DCTSIZE/4
.convloop:
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
; Zero-extend bytes to words, then add the bias word-wise.
punpcklbw xmm0,xmm6 ; xmm0=(01234567)
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
paddw xmm0,xmm7
paddw xmm1,xmm7
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2,xmm7
paddw xmm3,xmm7
punpcklbw xmm0, xmm6 ; xmm0=(01234567)
punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
paddw xmm0, xmm7
paddw xmm1, xmm7
punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2, xmm7
paddw xmm3, xmm7
; movdqa requires a 16-byte-aligned destination.
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
; Advance by four input row pointers and four output rows.
add rsi, byte 4*SIZEOF_JSAMPROW
add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec rcx
jnz short .convloop
add rsi, byte 4*SIZEOF_JSAMPROW
add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec rcx
jnz short .convloop
pop rbx
uncollect_args
pop rbp
ret
pop rbx
uncollect_args
pop rbp
ret
; --------------------------------------------------------------------------
;
@@ -102,85 +102,85 @@ EXTN(jsimd_convsamp_sse2):
; DCTELEM *workspace);
;
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace
; --------------------------------------------------------------------------
; jsimd_quantize_sse2 (64-bit build)
;
; Integer quantization of the workspace into the coefficient block.  For
; each 16-bit element: take the absolute value, add the CORRECTION entry,
; keep the high half of the unsigned product with the RECIPROCAL entry, keep
; the high half of the unsigned product with the SCALE entry, then restore
; the original sign.  Thirty-two elements are processed per loop iteration.
;
; Register use after collect_args: r12 -> rsi (workspace, input),
; r11 -> rdx (divisors; the RECIPROCAL/CORRECTION/SCALE macros address three
; tables at DCTSIZE*0, DCTSIZE*1 and DCTSIZE*2 rows from this base),
; r10 -> rdi (coef_block, output).
;
; NOTE(review): in this listing every group of lines appears twice -- the
; original spacing followed by the reformatted spacing.
; --------------------------------------------------------------------------
align 16
global EXTN(jsimd_quantize_sse2)
align 16
global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2):
; NOTE(review): rax is loaded with rsp here and later reused as the loop
; counter; presumably collect_args (defined elsewhere) reads it -- confirm.
push rbp
mov rax,rsp
mov rbp,rsp
collect_args
push rbp
mov rax, rsp
mov rbp, rsp
collect_args
; rax = number of loop iterations (32 elements each).
mov rsi, r12
mov rdx, r11
mov rdi, r10
mov rax, DCTSIZE2/32
mov rsi, r12
mov rdx, r11
mov rdi, r10
mov rax, DCTSIZE2/32
.quantloop:
; Absolute value without branches: xmm4..xmm7 become per-word sign masks
; (arithmetic shift by WORD_BIT-1), and (x XOR mask) - mask negates exactly
; the negative words.  The masks are kept to restore the sign afterwards.
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm0,xmm4
movdqa xmm1,xmm5
movdqa xmm2,xmm6
movdqa xmm3,xmm7
psraw xmm4,(WORD_BIT-1)
psraw xmm5,(WORD_BIT-1)
psraw xmm6,(WORD_BIT-1)
psraw xmm7,(WORD_BIT-1)
pxor xmm0,xmm4
pxor xmm1,xmm5
pxor xmm2,xmm6
pxor xmm3,xmm7
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm0, xmm4
movdqa xmm1, xmm5
movdqa xmm2, xmm6
movdqa xmm3, xmm7
psraw xmm4, (WORD_BIT-1)
psraw xmm5, (WORD_BIT-1)
psraw xmm6, (WORD_BIT-1)
psraw xmm7, (WORD_BIT-1)
pxor xmm0, xmm4
pxor xmm1, xmm5
pxor xmm2, xmm6
pxor xmm3, xmm7
psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
; pmulhuw keeps the upper 16 bits of each unsigned 16x16 product.
paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
; Restore the original sign using the masks saved in xmm4..xmm7.
pxor xmm0,xmm4
pxor xmm1,xmm5
pxor xmm2,xmm6
pxor xmm3,xmm7
psubw xmm0,xmm4
psubw xmm1,xmm5
psubw xmm2,xmm6
psubw xmm3,xmm7
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
pxor xmm0, xmm4
pxor xmm1, xmm5
pxor xmm2, xmm6
pxor xmm3, xmm7
psubw xmm0, xmm4
psubw xmm1, xmm5
psubw xmm2, xmm6
psubw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
; Advance all three pointers by the 32 elements just processed.
add rsi, byte 32*SIZEOF_DCTELEM
add rdx, byte 32*SIZEOF_DCTELEM
add rdi, byte 32*SIZEOF_JCOEF
dec rax
jnz near .quantloop
add rsi, byte 32*SIZEOF_DCTELEM
add rdx, byte 32*SIZEOF_DCTELEM
add rdi, byte 32*SIZEOF_JCOEF
dec rax
jnz near .quantloop
uncollect_args
pop rbp
ret
uncollect_args
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -19,8 +19,8 @@
%include "jdct.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
SECTION SEG_TEXT
BITS 32
;
; Load data into workspace, applying unsigned->signed conversion
;
@@ -29,70 +29,70 @@
; DCTELEM *workspace);
;
; jsimd_convsamp_sse2 -- 32-bit build; arguments are read from the stack
; through the ebp frame (sample_data, start_col, workspace).
;
; Copies a strip of samples into the DCT workspace: for each of DCTSIZE
; rows it loads 8 bytes (one movq) starting at column start_col, widens
; them to 16-bit words and adds 0xFF80 to every word, which in 16-bit
; arithmetic is the same as subtracting 128 (unsigned->signed conversion).
;
; Register use inside the loop:
;   esi = current position in the row-pointer array
;   eax = start_col (byte offset into each row)
;   edi = current position in the workspace
;   ecx = remaining groups of four rows
;   xmm6 = zero, xmm7 = 0xFF80 in every word
; ebx, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm3,
; xmm6, xmm7 are overwritten.
; The workspace is written with movdqa, so it has to be 16-byte aligned.
;
; NOTE(review): in this listing every reformatted statement is shown twice
; (original spacing first, then the reformatted text).
%define sample_data ebp+8 ; JSAMPARRAY sample_data
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; DCTELEM *workspace
%define sample_data ebp+8 ; JSAMPARRAY sample_data
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; DCTELEM *workspace
align 16
global EXTN(jsimd_convsamp_sse2)
align 16
global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2):
push ebp
mov ebp,esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; Build the two loop constants without touching memory: all-ones words
; shifted left by 7 give 0xFF80.
pxor xmm6,xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7,xmm7
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
pxor xmm6, xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7, xmm7
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
; Four rows are handled per iteration, hence DCTSIZE/4 iterations.
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/4
alignx 16,7
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/4
alignx 16, 7
.convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
; Interleaving with the zero register widens each byte to a word; adding
; xmm7 (0xFF80 per word) then re-centres the samples around zero.
punpcklbw xmm0,xmm6 ; xmm0=(01234567)
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
paddw xmm0,xmm7
paddw xmm1,xmm7
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2,xmm7
paddw xmm3,xmm7
punpcklbw xmm0, xmm6 ; xmm0=(01234567)
punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
paddw xmm0, xmm7
paddw xmm1, xmm7
punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2, xmm7
paddw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
; Advance by four row pointers and by four workspace rows.
add esi, byte 4*SIZEOF_JSAMPROW
add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec ecx
jnz short .convloop
add esi, byte 4*SIZEOF_JSAMPROW
add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec ecx
jnz short .convloop
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; --------------------------------------------------------------------------
;
@@ -107,93 +107,93 @@ EXTN(jsimd_convsamp_sse2):
; DCTELEM *workspace);
;
; jsimd_quantize_sse2 -- 32-bit build; arguments are read from the stack
; through the ebp frame (coef_block, divisors, workspace).
;
; Quantizes the workspace into coef_block, 32 coefficients (four xmm
; registers of eight words) per loop iteration, DCTSIZE2/32 iterations:
;   1. split each value into a sign mask (psraw by WORD_BIT-1) and its
;      absolute value (xor with the mask, then subtract the mask);
;   2. add the CORRECTION entry, then take the high word of the unsigned
;      product with the RECIPROCAL entry and again with the SCALE entry;
;   3. re-apply the sign with the same xor/subtract pair and store.
;
; The divisors buffer is addressed as three consecutive tables, selected
; by the RECIPROCAL / CORRECTION / SCALE macros below (row offsets
; DCTSIZE*0, DCTSIZE*1 and DCTSIZE*2 passed to XMMBLOCK).
; NOTE(review): XMMBLOCK is defined outside this listing -- confirm the
; exact byte layout there.
;
; Register use: esi = workspace, edx = divisors, edi = coef_block,
; eax = loop counter. esi and edi are saved and restored; eax, edx and
; xmm0-xmm7 are overwritten. All three buffers are accessed as aligned
; 16-byte memory operands (movdqa and SSE2 memory-source arithmetic).
;
; NOTE(review): in this listing every reformatted statement is shown twice
; (original spacing first, then the reformatted text).
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; DCTELEM *divisors
%define workspace ebp+16 ; DCTELEM *workspace
%define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; DCTELEM *divisors
%define workspace ebp+16 ; DCTELEM *workspace
align 16
global EXTN(jsimd_quantize_sse2)
align 16
global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2):
push ebp
mov ebp,esp
; push ebx ; unused
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; unused
; push edx ; need not be preserved
push esi
push edi
mov esi, POINTER [workspace]
mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/32
alignx 16,7
mov esi, POINTER [workspace]
mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/32
alignx 16, 7
.quantloop:
; Load four rows; xmm4-xmm7 become the sign masks (0 or all ones per word)
; and xmm0-xmm3 the magnitudes.
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
movdqa xmm0,xmm4
movdqa xmm1,xmm5
movdqa xmm2,xmm6
movdqa xmm3,xmm7
psraw xmm4,(WORD_BIT-1)
psraw xmm5,(WORD_BIT-1)
psraw xmm6,(WORD_BIT-1)
psraw xmm7,(WORD_BIT-1)
pxor xmm0,xmm4
pxor xmm1,xmm5
pxor xmm2,xmm6
pxor xmm3,xmm7
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
movdqa xmm0, xmm4
movdqa xmm1, xmm5
movdqa xmm2, xmm6
movdqa xmm3, xmm7
psraw xmm4, (WORD_BIT-1)
psraw xmm5, (WORD_BIT-1)
psraw xmm6, (WORD_BIT-1)
psraw xmm7, (WORD_BIT-1)
pxor xmm0, xmm4
pxor xmm1, xmm5
pxor xmm2, xmm6
pxor xmm3, xmm7
psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
; Division is replaced by two unsigned high-word multiplies on the
; magnitudes (pmulhuw keeps bits 16..31 of each 16x16 product).
paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
; Put the original sign back: the same xor/subtract pair negates exactly
; those words whose mask is all ones.
pxor xmm0,xmm4
pxor xmm1,xmm5
pxor xmm2,xmm6
pxor xmm3,xmm7
psubw xmm0,xmm4
psubw xmm1,xmm5
psubw xmm2,xmm6
psubw xmm3,xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
pxor xmm0, xmm4
pxor xmm1, xmm5
pxor xmm2, xmm6
pxor xmm3, xmm7
psubw xmm0, xmm4
psubw xmm1, xmm5
psubw xmm2, xmm6
psubw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
; All three pointers move on by the 32 elements just processed.
add esi, byte 32*SIZEOF_DCTELEM
add edx, byte 32*SIZEOF_DCTELEM
add edi, byte 32*SIZEOF_JCOEF
dec eax
jnz near .quantloop
add esi, byte 32*SIZEOF_DCTELEM
add edx, byte 32*SIZEOF_DCTELEM
add edi, byte 32*SIZEOF_JCOEF
dec eax
jnz near .quantloop
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
; pop ebx ; unused
pop ebp
ret
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; unused
; pop ebx ; unused
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16
align 16

View File

@@ -19,79 +19,79 @@
; -- jpeglib.h
;
%define _cpp_protection_DCTSIZE DCTSIZE
%define _cpp_protection_DCTSIZE2 DCTSIZE2
%define _cpp_protection_DCTSIZE DCTSIZE
%define _cpp_protection_DCTSIZE2 DCTSIZE2
;
; -- jmorecfg.h
;
%define _cpp_protection_RGB_RED RGB_RED
%define _cpp_protection_RGB_GREEN RGB_GREEN
%define _cpp_protection_RGB_BLUE RGB_BLUE
%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
%define _cpp_protection_RGB_RED RGB_RED
%define _cpp_protection_RGB_GREEN RGB_GREEN
%define _cpp_protection_RGB_BLUE RGB_BLUE
%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define RGBX_FILLER_0XFF 1
%define RGBX_FILLER_0XFF 1
; Representation of a single sample (pixel element value).
; On this SIMD implementation, this must be 'unsigned char'.
;
%define JSAMPLE byte ; unsigned char
%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
%define JSAMPLE byte ; unsigned char
%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
; Representation of a DCT frequency coefficient.
; On this SIMD implementation, this must be 'short'.
;
%define JCOEF word ; short
%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
%define JCOEF word ; short
%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
; Datatype used for image dimensions.
; On this SIMD implementation, this must be 'unsigned int'.
;
%define JDIMENSION dword ; unsigned int
%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
%define JDIMENSION dword ; unsigned int
%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
;
; -- jdct.h
@@ -101,30 +101,30 @@
; the DCT is to be performed in-place in that buffer.
; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
;
%define DCTELEM word ; short
%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
%define DCTELEM word ; short
%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
%define FAST_FLOAT FP32 ; float
%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
%define FAST_FLOAT FP32 ; float
%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
; To maximize parallelism, Type MULTIPLIER is changed to short.
;
%define ISLOW_MULT_TYPE word ; must be short
%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
%define ISLOW_MULT_TYPE word ; must be short
%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
%define IFAST_MULT_TYPE word ; must be short
%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
%define IFAST_MULT_TYPE word ; must be short
%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
%define FLOAT_MULT_TYPE FP32 ; must be float
%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
%define FLOAT_MULT_TYPE FP32 ; must be float
%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
;
; -- jsimd.h
;
%define _cpp_protection_JSIMD_NONE JSIMD_NONE
%define _cpp_protection_JSIMD_MMX JSIMD_MMX
%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
%define _cpp_protection_JSIMD_SSE JSIMD_SSE
%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
%define _cpp_protection_JSIMD_NONE JSIMD_NONE
%define _cpp_protection_JSIMD_MMX JSIMD_MMX
%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
%define _cpp_protection_JSIMD_SSE JSIMD_SSE
%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2

View File

@@ -38,11 +38,11 @@
; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT .text align=16
%define SEG_CONST .rdata align=16
%define SEG_TEXT .text align=16
%define SEG_CONST .rdata align=16
%else
%define SEG_TEXT .text align=16 public use32 class=CODE
%define SEG_CONST .rdata align=16 public use32 class=CONST
%define SEG_TEXT .text align=16 public use32 class=CODE
%define SEG_CONST .rdata align=16 public use32 class=CONST
%endif
%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
@@ -57,15 +57,15 @@
%define SEG_TEXT .text align=16 public use64 class=CODE
%define SEG_CONST .rdata align=16 public use64 class=CONST
%endif
%define EXTN(name) name ; foo() -> foo
%define EXTN(name) name ; foo() -> foo
%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)
; -- segment definition --
;
%define SEG_TEXT _text align=16 public use32 class=CODE
%define SEG_CONST _data align=16 public use32 class=DATA
%define SEG_TEXT _text align=16 public use32 class=CODE
%define SEG_CONST _data align=16 public use32 class=DATA
%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
@@ -78,17 +78,17 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT .text progbits align=16
%define SEG_CONST .rodata progbits align=16
%define SEG_TEXT .text progbits align=16
%define SEG_CONST .rodata progbits align=16
%else
%define SEG_TEXT .text progbits alloc exec nowrite align=16
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%define SEG_TEXT .text progbits alloc exec nowrite align=16
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
%define EXTN(name) name ; foo() -> foo
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
%define EXTN(name) name ; foo() -> foo
%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
@@ -96,20 +96,20 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; -- segment definition --
;
%define SEG_TEXT .text
%define SEG_CONST .data
%define SEG_TEXT .text
%define SEG_CONST .data
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
; -- segment definition --
;
%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
%define SEG_CONST .rodata align=16
%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
%define SEG_CONST .rodata align=16
; The generation of position-independent code (PIC) is the default on Darwin.
;
@@ -120,10 +120,10 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; -- segment definition --
;
%define SEG_TEXT .text
%define SEG_CONST .data
%define SEG_TEXT .text
%define SEG_CONST .data
%endif ; ----------------------------------------------
%endif ; ----------------------------------------------
; ==========================================================================
@@ -131,54 +131,54 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; Common types
;
%ifdef __x86_64__
%define POINTER qword ; general pointer type
%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%define POINTER qword ; general pointer type
%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER dword ; general pointer type
%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%define POINTER dword ; general pointer type
%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%endif
%define INT dword ; signed integer type
%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
%define INT dword ; signed integer type
%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
%define FP32 dword ; IEEE754 single
%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
%define FP32 dword ; IEEE754 single
%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
%define MMWORD qword ; int64 (MMX register)
%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
%define MMWORD qword ; int64 (MMX register)
%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD ; int128 (SSE register)
%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
%define XMMWORD ; int128 (SSE register)
%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD
%define SIZEOF_BYTE 1 ; sizeof(BYTE)
%define SIZEOF_WORD 2 ; sizeof(WORD)
%define SIZEOF_DWORD 4 ; sizeof(DWORD)
%define SIZEOF_QWORD 8 ; sizeof(QWORD)
%define SIZEOF_OWORD 16 ; sizeof(OWORD)
%define SIZEOF_BYTE 1 ; sizeof(BYTE)
%define SIZEOF_WORD 2 ; sizeof(WORD)
%define SIZEOF_DWORD 4 ; sizeof(DWORD)
%define SIZEOF_QWORD 8 ; sizeof(QWORD)
%define SIZEOF_OWORD 16 ; sizeof(OWORD)
%define BYTE_BIT 8 ; CHAR_BIT in C
%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
%define BYTE_BIT 8 ; CHAR_BIT in C
%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
; --------------------------------------------------------------------------
; External Symbol Name
;
%ifndef EXTN
%define EXTN(name) _ %+ name ; foo() -> _foo
%define EXTN(name) _ %+ name ; foo() -> _foo
%endif
; --------------------------------------------------------------------------
@@ -188,75 +188,76 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%undef PIC
%endif
%ifdef PIC ; -------------------------------------------
%ifdef PIC ; -------------------------------------------
%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
SECTION SEG_CONST
SECTION SEG_CONST
const_base:
%define GOTOFF(got,sym) (got) + (sym) - const_base
%define GOTOFF(got,sym) (got) + (sym) - const_base
; get_GOT <reg> (variant used when GOT_SYMBOL is _MACHO_PIC_)
;
; Leaves in <reg> the run-time address of const_base, the anchor that
; GOTOFF() subtracts from symbol addresses.
;   1. %%geteip returns, in ecx, the address of the instruction that
;      follows the call (the "add").
;   2. The add converts that into the run-time address of %%ref.
;   3. A hand-encoded lea adds (const_base - %%ref). The displacement is
;      not written directly: the two db bytes are followed by
;      "jmp near const_base", whose opcode byte (E9) serves as the lea's
;      SIB byte (ecx + ebp*8) and whose rel32 operand is exactly
;      const_base - %%ref. ebp is zeroed first so the index term vanishes.
; Clobbers ecx and the flags even when <reg> is another register; ebp is
; saved and restored around the sequence.
%imacro get_GOT 1
; NOTE: this macro destroys ecx register.
call %%geteip
add ecx, byte (%%ref - $)
jmp short %%adjust
; NOTE: this macro destroys ecx register.
call %%geteip
add ecx, byte (%%ref - $)
jmp short %%adjust
%%geteip:
mov ecx, POINTER [esp]
ret
mov ecx, POINTER [esp]
ret
%%adjust:
push ebp
xor ebp,ebp ; ebp = 0
%ifidni %1,ebx ; (%1 == ebx)
; db 0x8D,0x9C + jmp near const_base =
; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
db 0x8D,0x9C ; 8D,9C
jmp near const_base ; E9,(const_base-%%ref)
push ebp
xor ebp, ebp ; ebp = 0
%ifidni %1, ebx ; (%1 == ebx)
; db 0x8D,0x9C + jmp near const_base =
; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
db 0x8D, 0x9C ; 8D,9C
jmp near const_base ; E9,(const_base-%%ref)
%%ref:
%else ; (%1 != ebx)
; db 0x8D,0x8C + jmp near const_base =
; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
db 0x8D,0x8C ; 8D,8C
jmp near const_base ; E9,(const_base-%%ref)
%%ref: mov %1, ecx
%endif ; (%1 == ebx)
pop ebp
; db 0x8D,0x8C + jmp near const_base =
; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
db 0x8D, 0x8C ; 8D,8C
jmp near const_base ; E9,(const_base-%%ref)
%%ref:
mov %1, ecx
%endif ; (%1 == ebx)
pop ebp
%endmacro
%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
; get_GOT <reg> (variant used when GOT_SYMBOL is not _MACHO_PIC_)
;
; %%geteip copies the return address of the call into <reg>; the add then
; applies a "wrt ..gotpc" expression against GOT_SYMBOL to that value.
; NOTE(review): this looks like the standard NASM PIC sequence, which would
; leave the GOT base in <reg> for use with GOTOFF() -- confirm against the
; NASM manual. Only <reg> and the flags are modified.
%imacro get_GOT 1
extern GOT_SYMBOL
call %%geteip
add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
jmp short %%done
extern GOT_SYMBOL
call %%geteip
add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
jmp short %%done
%%geteip:
mov %1, POINTER [esp]
ret
mov %1, POINTER [esp]
ret
%%done:
%endmacro
%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
%imacro pushpic 1.nolist
push %1
push %1
%endmacro
%imacro poppic 1.nolist
pop %1
pop %1
%endmacro
%imacro movpic 2.nolist
mov %1,%2
mov %1, %2
%endmacro
%else ; !PIC -----------------------------------------
%else ; !PIC -----------------------------------------
%define GOTOFF(got,sym) (sym)
%define GOTOFF(got,sym) (sym)
%imacro get_GOT 1.nolist
%endmacro
@@ -267,7 +268,7 @@ const_base:
%imacro movpic 2.nolist
%endmacro
%endif ; PIC -----------------------------------------
%endif ; PIC -----------------------------------------
; --------------------------------------------------------------------------
; Align the next instruction on {2,4,8,16,..}-byte boundary.
@@ -277,28 +278,29 @@ const_base:
%define FILLB(b,n) (($$-(b)) & ((n)-1))
; alignx <boundary> [, <limit>]   (limit defaults to 0xFFFF)
;
; Emits filler bytes in front of the next instruction. The filler is built
; from the sequences listed below, longest first (7-, 6-, 4-, 3- and 2-byte
; lea/mov forms that leave their register unchanged, then single nops),
; each repeated by a "times" count derived from FILLB($,%1).
; Every count is masked with MSKLE(FILLB(%%bs,%1),%2).
; NOTE(review): MSKLE is defined outside this listing; presumably it makes
; the macro emit nothing when the padding needed exceeds <limit> -- confirm.
; Call sites in this listing use "alignx 16, 7" before loop labels.
%imacro alignx 1-2.nolist 0xFFFF
%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
db 0x90 ; nop
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
db 0x8B,0xED ; mov ebp,ebp
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
db 0x90 ; nop
%%bs: \
times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
db 0x90 ; nop
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
db 0x8B,0xED ; mov ebp,ebp
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
db 0x90 ; nop
%endmacro
; Align the next data on {2,4,8,16,..}-byte boundary.
;
%imacro alignz 1.nolist
align %1, db 0 ; filling zeros
align %1, db 0 ; filling zeros
%endmacro
%ifdef __x86_64__
@@ -306,61 +308,61 @@ const_base:
%ifdef WIN64
; collect_args (WIN64 variant)
;
; Gathers up to six integer arguments into r10-r15 so the function bodies
; can use the same registers on every 64-bit target:
;   r10..r13 <- rcx, rdx, r8, r9 (register arguments)
;   r14, r15 <- [rax+48], [rax+56] (stack arguments)
; rax must hold the stack pointer as it was right after the function's
; "push rbp": from there, +8 is the return address, +16..+47 the 32-byte
; shadow space, and +48 / +56 the fifth and sixth arguments.
; Also saves the registers this ABI treats as callee-saved that the SIMD
; code overwrites: r12-r15, rsi, rdi, xmm6 and xmm7.
; Must be paired with uncollect_args at the same stack depth.
%imacro collect_args 0
push r12
push r13
push r14
push r15
mov r10, rcx
mov r11, rdx
mov r12, r8
mov r13, r9
mov r14, [rax+48]
mov r15, [rax+56]
push rsi
push rdi
sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm6
sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm7
push r12
push r13
push r14
push r15
mov r10, rcx
mov r11, rdx
mov r12, r8
mov r13, r9
mov r14, [rax+48]
mov r15, [rax+56]
push rsi
push rdi
sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm6
sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm7
%endmacro
; uncollect_args (WIN64 variant): undoes collect_args in exact reverse
; order -- xmm7, xmm6, rdi, rsi, then r15 down to r12.
%imacro uncollect_args 0
movaps xmm7, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD
movaps xmm6, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD
pop rdi
pop rsi
pop r15
pop r14
pop r13
pop r12
movaps xmm7, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD
movaps xmm6, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD
pop rdi
pop rsi
pop r15
pop r14
pop r13
pop r12
%endmacro
%else
; collect_args (non-WIN64 variant)
;
; Saves r10-r15 on the stack, then copies the six integer argument
; registers into them in order:
;   r10..r15 <- rdi, rsi, rdx, rcx, r8, r9
; so that function bodies can refer to arguments by the same register
; names as in the WIN64 variant. Must be paired with uncollect_args at
; the same stack depth.
%imacro collect_args 0
push r10
push r11
push r12
push r13
push r14
push r15
mov r10, rdi
mov r11, rsi
mov r12, rdx
mov r13, rcx
mov r14, r8
mov r15, r9
push r10
push r11
push r12
push r13
push r14
push r15
mov r10, rdi
mov r11, rsi
mov r12, rdx
mov r13, rcx
mov r14, r8
mov r15, r9
%endmacro
; uncollect_args (non-WIN64 variant): restores r15 down to r10 in the
; reverse of the order collect_args pushed them.
%imacro uncollect_args 0
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
%endmacro
%endif