Reformat SSE/SSE2 SIMD code to improve readability

This commit is contained in:
DRC
2016-05-27 16:58:23 -05:00
parent 3ff13e651b
commit ff5685d534
43 changed files with 11067 additions and 11065 deletions

View File

@@ -33,454 +33,454 @@
; r13 = JDIMENSION output_row ; r13 = JDIMENSION output_row
; r14 = int num_rows ; r14 = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8 %define WK_NUM 8
align 16 align 16
global EXTN(jsimd_rgb_ycc_convert_sse2) global EXTN(jsimd_rgb_ycc_convert_sse2)
EXTN(jsimd_rgb_ycc_convert_sse2): EXTN(jsimd_rgb_ycc_convert_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d mov ecx, r10d
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
mov rsi, r12 mov rsi, r12
mov ecx, r13d mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
pop rcx pop rcx
mov rsi, r11 mov rsi, r11
mov eax, r14d mov eax, r14d
test rax,rax test rax, rax
jle near .return jle near .return
.rowloop: .rowloop:
push rdx push rdx
push rbx push rbx
push rdi push rdi
push rsi push rsi
push rcx ; col push rcx ; col
mov rsi, JSAMPROW [rsi] ; inptr mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0 mov rdi, JSAMPROW [rdi] ; outptr0
mov rbx, JSAMPROW [rbx] ; outptr1 mov rbx, JSAMPROW [rbx] ; outptr1
mov rdx, JSAMPROW [rdx] ; outptr2 mov rdx, JSAMPROW [rdx] ; outptr2
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1: .column_ld1:
push rax push rax
push rdx push rdx
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub rcx, byte SIZEOF_BYTE sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx] movzx rax, BYTE [rsi+rcx]
.column_ld2: .column_ld2:
test cl, SIZEOF_WORD test cl, SIZEOF_WORD
jz short .column_ld4 jz short .column_ld4
sub rcx, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx] movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT shl rax, WORD_BIT
or rax,rdx or rax, rdx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop rdx pop rdx
pop rax pop rax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
jz short .column_ld8 jz short .column_ld8
sub rcx, byte SIZEOF_DWORD sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx] movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx] movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
.column_ld32: .column_ld32:
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv: .rgb_ycc_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1: .column_ld1:
test cl, SIZEOF_XMMWORD/16 test cl, SIZEOF_XMMWORD/16
jz short .column_ld2 jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16 sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2: .column_ld2:
test cl, SIZEOF_XMMWORD/8 test cl, SIZEOF_XMMWORD/8
jz short .column_ld4 jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8 sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv: .rgb_ycc_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original) ; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
; ;
; (This implementation) ; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm1,xmm5 ; xmm1=BOL punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6,xmm5 ; xmm6=BOH punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1,1 ; xmm1=BOL*FIX(0.500) psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6,1 ; xmm6=BOH*FIX(0.500) psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm7,xmm5 paddd xmm7, xmm5
paddd xmm4,xmm5 paddd xmm4, xmm5
psrld xmm7,SCALEBITS ; xmm7=CbOL psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4,SCALEBITS ; xmm4=CbOH psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7,xmm4 ; xmm7=CbO packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
movdqa xmm5,xmm0 movdqa xmm5, xmm0
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0,xmm0 pxor xmm0, xmm0
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm0,xmm1 ; xmm0=BEL punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6,xmm1 ; xmm6=BEH punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0,1 ; xmm0=BEL*FIX(0.500) psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6,1 ; xmm6=BEH*FIX(0.500) psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5,xmm0 paddd xmm5, xmm0
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm4,xmm1 paddd xmm4, xmm1
psrld xmm5,SCALEBITS ; xmm5=CbEL psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4,SCALEBITS ; xmm4=CbEH psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5,xmm4 ; xmm5=CbE packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm5,xmm7 ; xmm5=Cb por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [rbx], xmm5 ; Save Cb movdqa XMMWORD [rbx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
movdqa xmm7,xmm0 movdqa xmm7, xmm0
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)] paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)] paddd xmm4, XMMWORD [wk(5)]
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3,xmm3 pxor xmm3, xmm3
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm3,xmm1 ; xmm3=ROL punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4,xmm1 ; xmm4=ROH punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3,1 ; xmm3=ROL*FIX(0.500) psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4,1 ; xmm4=ROH*FIX(0.500) psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm3 paddd xmm7, xmm3
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm5,xmm1 paddd xmm5, xmm1
psrld xmm7,SCALEBITS ; xmm7=CrOL psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5,SCALEBITS ; xmm5=CrOH psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7,xmm5 ; xmm7=CrO packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)] paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)] paddd xmm4, XMMWORD [wk(7)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y movdqa XMMWORD [rdi], xmm6 ; Save Y
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm2,xmm3 ; xmm2=REL punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4,xmm3 ; xmm4=REH punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2,1 ; xmm2=REL*FIX(0.500) psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4,1 ; xmm4=REH*FIX(0.500) psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm1,xmm0 paddd xmm1, xmm0
paddd xmm5,xmm0 paddd xmm5, xmm0
psrld xmm1,SCALEBITS ; xmm1=CrEL psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5,SCALEBITS ; xmm5=CrEH psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1,xmm5 ; xmm1=CrE packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm1,xmm7 ; xmm1=Cr por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [rdx], xmm1 ; Save Cr movdqa XMMWORD [rdx], xmm1 ; Save Cr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0 add rdi, byte SIZEOF_XMMWORD ; outptr0
add rbx, byte SIZEOF_XMMWORD ; outptr1 add rbx, byte SIZEOF_XMMWORD ; outptr1
add rdx, byte SIZEOF_XMMWORD ; outptr2 add rdx, byte SIZEOF_XMMWORD ; outptr2
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test rcx,rcx test rcx, rcx
jnz near .column_ld1 jnz near .column_ld1
pop rcx ; col pop rcx ; col
pop rsi pop rsi
pop rdi pop rdi
pop rbx pop rbx
pop rdx pop rdx
add rsi, byte SIZEOF_JSAMPROW ; input_buf add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW add rdi, byte SIZEOF_JSAMPROW
add rbx, byte SIZEOF_JSAMPROW add rbx, byte SIZEOF_JSAMPROW
add rdx, byte SIZEOF_JSAMPROW add rdx, byte SIZEOF_JSAMPROW
dec rax ; num_rows dec rax ; num_rows
jg near .rowloop jg near .rowloop
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -25,479 +25,479 @@
; JDIMENSION output_row, int num_rows); ; JDIMENSION output_row, int num_rows);
; ;
%define img_width(b) (b)+8 ; JDIMENSION img_width %define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf %define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf %define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row %define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows %define num_rows(b) (b)+24 ; int num_rows
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8 %define WK_NUM 8
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16 align 16
global EXTN(jsimd_rgb_ycc_convert_sse2) global EXTN(jsimd_rgb_ycc_convert_sse2)
EXTN(jsimd_rgb_ycc_convert_sse2): EXTN(jsimd_rgb_ycc_convert_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
mov esi, JSAMPIMAGE [output_buf(eax)] mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)] mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW] lea edi, [edi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW] lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx pop ecx
mov esi, JSAMPARRAY [input_buf(eax)] mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax,eax test eax, eax
jle near .return jle near .return
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
pushpic eax pushpic eax
push edx push edx
push ebx push ebx
push edi push edi
push esi push esi
push ecx ; col push ecx ; col
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1 mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2 mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax) movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
alignx 16,7 alignx 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1: .column_ld1:
push eax push eax
push edx push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub ecx, byte SIZEOF_BYTE sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx] movzx eax, BYTE [esi+ecx]
.column_ld2: .column_ld2:
test cl, SIZEOF_WORD test cl, SIZEOF_WORD
jz short .column_ld4 jz short .column_ld4
sub ecx, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx] movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT shl eax, WORD_BIT
or eax,edx or eax, edx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop edx pop edx
pop eax pop eax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
jz short .column_ld8 jz short .column_ld8
sub ecx, byte SIZEOF_DWORD sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx] movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx] movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
.column_ld32: .column_ld32:
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv: .rgb_ycc_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1: .column_ld1:
test cl, SIZEOF_XMMWORD/16 test cl, SIZEOF_XMMWORD/16
jz short .column_ld2 jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16 sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2: .column_ld2:
test cl, SIZEOF_XMMWORD/8 test cl, SIZEOF_XMMWORD/8
jz short .column_ld4 jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8 sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv: .rgb_ycc_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original) ; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
; ;
; (This implementation) ; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm1,xmm5 ; xmm1=BOL punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6,xmm5 ; xmm6=BOH punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1,1 ; xmm1=BOL*FIX(0.500) psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6,1 ; xmm6=BOH*FIX(0.500) psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm7,xmm5 paddd xmm7, xmm5
paddd xmm4,xmm5 paddd xmm4, xmm5
psrld xmm7,SCALEBITS ; xmm7=CbOL psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4,SCALEBITS ; xmm4=CbOH psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7,xmm4 ; xmm7=CbO packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
movdqa xmm5,xmm0 movdqa xmm5, xmm0
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0,xmm0 pxor xmm0, xmm0
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm0,xmm1 ; xmm0=BEL punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6,xmm1 ; xmm6=BEH punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0,1 ; xmm0=BEL*FIX(0.500) psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6,1 ; xmm6=BEH*FIX(0.500) psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5,xmm0 paddd xmm5, xmm0
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm4,xmm1 paddd xmm4, xmm1
psrld xmm5,SCALEBITS ; xmm5=CbEL psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4,SCALEBITS ; xmm4=CbEH psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5,xmm4 ; xmm5=CbE packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm5,xmm7 ; xmm5=Cb por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [ebx], xmm5 ; Save Cb movdqa XMMWORD [ebx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
movdqa xmm7,xmm0 movdqa xmm7, xmm0
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)] paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)] paddd xmm4, XMMWORD [wk(5)]
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3,xmm3 pxor xmm3, xmm3
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm3,xmm1 ; xmm3=ROL punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4,xmm1 ; xmm4=ROH punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3,1 ; xmm3=ROL*FIX(0.500) psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4,1 ; xmm4=ROH*FIX(0.500) psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm3 paddd xmm7, xmm3
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm5,xmm1 paddd xmm5, xmm1
psrld xmm7,SCALEBITS ; xmm7=CrOL psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5,SCALEBITS ; xmm5=CrOH psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7,xmm5 ; xmm7=CrO packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)] paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)] paddd xmm4, XMMWORD [wk(7)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y movdqa XMMWORD [edi], xmm6 ; Save Y
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm2,xmm3 ; xmm2=REL punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4,xmm3 ; xmm4=REH punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2,1 ; xmm2=REL*FIX(0.500) psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4,1 ; xmm4=REH*FIX(0.500) psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm1,xmm0 paddd xmm1, xmm0
paddd xmm5,xmm0 paddd xmm5, xmm0
psrld xmm1,SCALEBITS ; xmm1=CrEL psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5,SCALEBITS ; xmm5=CrEH psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1,xmm5 ; xmm1=CrE packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm1,xmm7 ; xmm1=Cr por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [edx], xmm1 ; Save Cr movdqa XMMWORD [edx], xmm1 ; Save Cr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0 add edi, byte SIZEOF_XMMWORD ; outptr0
add ebx, byte SIZEOF_XMMWORD ; outptr1 add ebx, byte SIZEOF_XMMWORD ; outptr1
add edx, byte SIZEOF_XMMWORD ; outptr2 add edx, byte SIZEOF_XMMWORD ; outptr2
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test ecx,ecx test ecx, ecx
jnz near .column_ld1 jnz near .column_ld1
pop ecx ; col pop ecx ; col
pop esi pop esi
pop edi pop edi
pop ebx pop ebx
pop edx pop edx
poppic eax poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW add edx, byte SIZEOF_JSAMPROW
dec eax ; num_rows dec eax ; num_rows
jg near .rowloop jg near .rowloop
.return: .return:
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -19,23 +19,23 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131) F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400) F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874) F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000) F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900) F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126) F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869) F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700) F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_rgb_ycc_convert_sse2) global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2): EXTN(jconst_rgb_ycc_convert_sse2):
@@ -46,11 +46,11 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
%include "jccolext-sse2-64.asm" %include "jccolext-sse2-64.asm"

View File

@@ -19,23 +19,23 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131) F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400) F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874) F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000) F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900) F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126) F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869) F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700) F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_rgb_ycc_convert_sse2) global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2): EXTN(jconst_rgb_ycc_convert_sse2):
@@ -46,11 +46,11 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
%include "jccolext-sse2.asm" %include "jccolext-sse2.asm"

View File

@@ -19,31 +19,31 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400) F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000) F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900) F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700) F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_rgb_gray_convert_sse2) global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2): EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337 PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
%include "jcgryext-sse2-64.asm" %include "jcgryext-sse2-64.asm"

View File

@@ -19,31 +19,31 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400) F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000) F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900) F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700) F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_rgb_gray_convert_sse2) global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2): EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337 PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
%include "jcgryext-sse2.asm" %include "jcgryext-sse2.asm"

View File

@@ -33,333 +33,333 @@
; r13 = JDIMENSION output_row ; r13 = JDIMENSION output_row
; r14 = int num_rows ; r14 = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_rgb_gray_convert_sse2) global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2): EXTN(jsimd_rgb_gray_convert_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d mov ecx, r10d
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
mov rsi, r12 mov rsi, r12
mov ecx, r13d mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
pop rcx pop rcx
mov rsi, r11 mov rsi, r11
mov eax, r14d mov eax, r14d
test rax,rax test rax, rax
jle near .return jle near .return
.rowloop: .rowloop:
push rdi push rdi
push rsi push rsi
push rcx ; col push rcx ; col
mov rsi, JSAMPROW [rsi] ; inptr mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0 mov rdi, JSAMPROW [rdi] ; outptr0
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1: .column_ld1:
push rax push rax
push rdx push rdx
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub rcx, byte SIZEOF_BYTE sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx] movzx rax, BYTE [rsi+rcx]
.column_ld2: .column_ld2:
test cl, SIZEOF_WORD test cl, SIZEOF_WORD
jz short .column_ld4 jz short .column_ld4
sub rcx, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx] movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT shl rax, WORD_BIT
or rax,rdx or rax, rdx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop rdx pop rdx
pop rax pop rax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
jz short .column_ld8 jz short .column_ld8
sub rcx, byte SIZEOF_DWORD sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx] movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx] movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
.column_ld32: .column_ld32:
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv: .rgb_gray_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1: .column_ld1:
test cl, SIZEOF_XMMWORD/16 test cl, SIZEOF_XMMWORD/16
jz short .column_ld2 jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16 sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2: .column_ld2:
test cl, SIZEOF_XMMWORD/8 test cl, SIZEOF_XMMWORD/8
jz short .column_ld4 jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8 sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv: .rgb_gray_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original) ; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; ;
; (This implementation) ; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm4, xmm7 paddd xmm4, xmm7
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)] paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)] paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y movdqa XMMWORD [rdi], xmm6 ; Save Y
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0 add rdi, byte SIZEOF_XMMWORD ; outptr0
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test rcx,rcx test rcx, rcx
jnz near .column_ld1 jnz near .column_ld1
pop rcx ; col pop rcx ; col
pop rsi pop rsi
pop rdi pop rdi
add rsi, byte SIZEOF_JSAMPROW ; input_buf add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW add rdi, byte SIZEOF_JSAMPROW
dec rax ; num_rows dec rax ; num_rows
jg near .rowloop jg near .rowloop
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -27,358 +27,358 @@
; JDIMENSION output_row, int num_rows); ; JDIMENSION output_row, int num_rows);
; ;
%define img_width(b) (b)+8 ; JDIMENSION img_width %define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf %define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf %define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row %define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows %define num_rows(b) (b)+24 ; int num_rows
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16 align 16
global EXTN(jsimd_rgb_gray_convert_sse2) global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2): EXTN(jsimd_rgb_gray_convert_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
mov esi, JSAMPIMAGE [output_buf(eax)] mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)] mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW] lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx pop ecx
mov esi, JSAMPARRAY [input_buf(eax)] mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax,eax test eax, eax
jle near .return jle near .return
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
pushpic eax pushpic eax
push edi push edi
push esi push esi
push ecx ; col push ecx ; col
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax) movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
alignx 16,7 alignx 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1: .column_ld1:
push eax push eax
push edx push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub ecx, byte SIZEOF_BYTE sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx] movzx eax, BYTE [esi+ecx]
.column_ld2: .column_ld2:
test cl, SIZEOF_WORD test cl, SIZEOF_WORD
jz short .column_ld4 jz short .column_ld4
sub ecx, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx] movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT shl eax, WORD_BIT
or eax,edx or eax, edx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop edx pop edx
pop eax pop eax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
jz short .column_ld8 jz short .column_ld8
sub ecx, byte SIZEOF_DWORD sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx] movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx] movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
.column_ld32: .column_ld32:
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv: .rgb_gray_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1: .column_ld1:
test cl, SIZEOF_XMMWORD/16 test cl, SIZEOF_XMMWORD/16
jz short .column_ld2 jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16 sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2: .column_ld2:
test cl, SIZEOF_XMMWORD/8 test cl, SIZEOF_XMMWORD/8
jz short .column_ld4 jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8 sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv: .rgb_gray_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original) ; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; ;
; (This implementation) ; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm4, xmm7 paddd xmm4, xmm7
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)] paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)] paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y movdqa XMMWORD [edi], xmm6 ; Save Y
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0 add edi, byte SIZEOF_XMMWORD ; outptr0
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test ecx,ecx test ecx, ecx
jnz near .column_ld1 jnz near .column_ld1
pop ecx ; col pop ecx ; col
pop esi pop esi
pop edi pop edi
poppic eax poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows dec eax ; num_rows
jg near .rowloop jg near .rowloop
.return: .return:
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -23,20 +23,20 @@
%include "jsimdext.inc" %include "jsimdext.inc"
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_huff_encode_one_block) global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block): EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc" %include "jpeg_nbits_table.inc"
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; These macros perform the same task as the emit_bits() function in the ; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly ; original libjpeg code. In addition to reducing overhead by explicitly
@@ -46,118 +46,118 @@ EXTN(jconst_huff_encode_one_block):
; bytes can be stored in a 64-bit bit buffer before it has to be emptied. ; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
%macro EMIT_BYTE 0 %macro EMIT_BYTE 0
sub put_bits, 8 ; put_bits -= 8; sub put_bits, 8 ; put_bits -= 8;
mov rdx, put_buffer mov rdx, put_buffer
mov ecx, put_bits mov ecx, put_bits
shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [buffer], dl ; *buffer++ = c; mov byte [buffer], dl ; *buffer++ = c;
add buffer, 1 add buffer, 1
cmp dl, 0xFF ; need to stuff a zero byte? cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END jne %%.EMIT_BYTE_END
mov byte [buffer], 0 ; *buffer++ = 0; mov byte [buffer], 0 ; *buffer++ = 0;
add buffer, 1 add buffer, 1
%%.EMIT_BYTE_END: %%.EMIT_BYTE_END:
%endmacro %endmacro
%macro PUT_BITS 1 %macro PUT_BITS 1
add put_bits, ecx ; put_bits += size; add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size); shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1 or put_buffer, %1
%endmacro %endmacro
%macro CHECKBUF31 0 %macro CHECKBUF31 0
cmp put_bits, 32 ; if (put_bits > 31) { cmp put_bits, 32 ; if (put_bits > 31) {
jl %%.CHECKBUF31_END jl %%.CHECKBUF31_END
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
%%.CHECKBUF31_END: %%.CHECKBUF31_END:
%endmacro %endmacro
%macro CHECKBUF47 0 %macro CHECKBUF47 0
cmp put_bits, 48 ; if (put_bits > 47) { cmp put_bits, 48 ; if (put_bits > 47) {
jl %%.CHECKBUF47_END jl %%.CHECKBUF47_END
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
%%.CHECKBUF47_END: %%.CHECKBUF47_END:
%endmacro %endmacro
%macro EMIT_BITS 2 %macro EMIT_BITS 2
CHECKBUF47 CHECKBUF47
mov ecx, %2 mov ecx, %2
PUT_BITS %1 PUT_BITS %1
%endmacro %endmacro
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) %macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32 %if %1 != 32
pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else %else
pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
%endif %endif
pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg); paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro %endmacro
; ;
@@ -176,185 +176,185 @@ EXTN(jconst_huff_encode_one_block):
; r14 = c_derived_tbl *dctbl ; r14 = c_derived_tbl *dctbl
; r15 = c_derived_tbl *actbl ; r15 = c_derived_tbl *actbl
%define t1 rbp-(DCTSIZE2*SIZEOF_WORD) %define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
%define t2 t1-(DCTSIZE2*SIZEOF_WORD) %define t2 t1-(DCTSIZE2*SIZEOF_WORD)
%define put_buffer r8 %define put_buffer r8
%define put_bits r9d %define put_bits r9d
%define buffer rax %define buffer rax
align 16 align 16
global EXTN(jsimd_huff_encode_one_block_sse2) global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2): EXTN(jsimd_huff_encode_one_block_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2] lea rsp, [t2]
collect_args collect_args
%ifdef WIN64 %ifdef WIN64
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10 movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11 movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
sub rsp, 4*SIZEOF_XMMWORD sub rsp, 4*SIZEOF_XMMWORD
%endif %endif
push rbx push rbx
mov buffer, r11 ; r11 is now sratch mov buffer, r11 ; r11 is now sratch
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits; mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
push r10 ; r10 is now scratch push r10 ; r10 is now scratch
; Encode the DC coefficient difference per section F.1.2.1 ; Encode the DC coefficient difference per section F.1.2.1
movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
sub edi, r13d ; r13 is not used anymore sub edi, r13d ; r13 is not used anymore
mov ebx, edi mov ebx, edi
; This is a well-known technique for obtaining the absolute value ; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique ; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors", ; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog. ; Copyright (c) 1996, 1997 by Agner Fog.
mov esi, edi mov esi, edi
sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
xor edi, esi ; temp ^= temp3; xor edi, esi ; temp ^= temp3;
sub edi, esi ; temp -= temp3; sub edi, esi ; temp -= temp3;
; For a negative input, want temp2 = bitwise complement of abs(input) ; For a negative input, want temp2 = bitwise complement of abs(input)
; This code assumes we are on a two's complement machine ; This code assumes we are on a two's complement machine
add ebx, esi ; temp2 += temp3; add ebx, esi ; temp2 += temp3;
; Find the number of bits needed for the magnitude of the coefficient ; Find the number of bits needed for the magnitude of the coefficient
lea r11, [rel jpeg_nbits_table] lea r11, [rel jpeg_nbits_table]
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
; Emit the Huffman-coded symbol for the number of bits ; Emit the Huffman-coded symbol for the number of bits
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS r11, esi ; EMIT_BITS(code, size) EMIT_BITS r11, esi ; EMIT_BITS(code, size)
; Mask off any extra bits in code ; Mask off any extra bits in code
mov esi, 1 mov esi, 1
mov ecx, edi mov ecx, edi
shl esi, cl shl esi, cl
dec esi dec esi
and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1; and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
; Emit that number of bits of the value, if positive, ; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative. ; or the complement of its magnitude, if negative.
EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits) EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
; Prepare data ; Prepare data
xor ebx, ebx xor ebx, ebx
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \ kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \ 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
27, 20, 13, 6, 7, 14, 21, 28, 35, \ 27, 20, 13, 6, 7, 14, 21, 28, 35, \
xmm0, xmm1, xmm2, xmm3 xmm0, xmm1, xmm2, xmm3
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \ kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \ 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
53, 60, 61, 54, 47, 55, 62, 63, 63, \ 53, 60, 61, 54, 47, 55, 62, 63, 63, \
xmm4, xmm5, xmm6, xmm7 xmm4, xmm5, xmm6, xmm7
pxor xmm8, xmm8 pxor xmm8, xmm8
pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero); pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero); pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero); pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero); pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5); packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7); packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32; pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48; pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
shl r12, 16 shl r12, 16
shl r14, 16 shl r14, 16
or r11, r12 or r11, r12
or r13, r14 or r13, r14
shl r13, 32 shl r13, 32
or r11, r13 or r11, r13
not r11 ; index = ~index; not r11 ; index = ~index;
;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11 ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
;jmp .EFN ;jmp .EFN
mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
lea rsi, [t1] lea rsi, [t1]
.BLOOP: .BLOOP:
bsf r12, r11 ; r = __builtin_ctzl(index); bsf r12, r11 ; r = __builtin_ctzl(index);
jz .ELOOP jz .ELOOP
mov rcx, r12 mov rcx, r12
lea rsi, [rsi+r12*2] ; k += r; lea rsi, [rsi+r12*2] ; k += r;
shr r11, cl ; index >>= r; shr r11, cl ; index >>= r;
movzx rdi, word [rsi] ; temp = t1[k]; movzx rdi, word [rsi] ; temp = t1[k];
lea rbx, [rel jpeg_nbits_table] lea rbx, [rel jpeg_nbits_table]
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
.BRLOOP: .BRLOOP:
cmp r12, 16 ; while (r > 15) { cmp r12, 16 ; while (r > 15) {
jl .ERLOOP jl .ERLOOP
EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
sub r12, 16 ; r -= 16; sub r12, 16 ; r -= 16;
jmp .BRLOOP jmp .BRLOOP
.ERLOOP: .ERLOOP:
; Emit Huffman symbol for run length / number of bits ; Emit Huffman symbol for run length / number of bits
CHECKBUF31 ; uses rcx, rdx CHECKBUF31 ; uses rcx, rdx
shl r12, 4 ; temp3 = (r << 4) + nbits; shl r12, 4 ; temp3 = (r << 4) + nbits;
add r12, rdi add r12, rdi
mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
PUT_BITS rbx PUT_BITS rbx
;EMIT_CODE(code, size) ;EMIT_CODE(code, size)
movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code ; Mask off any extra bits in code
mov rcx, rdi mov rcx, rdi
mov rdx, 1 mov rdx, 1
shl rdx, cl shl rdx, cl
dec rdx dec rdx
and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1; and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
PUT_BITS rbx ; PUT_BITS(temp2, nbits) PUT_BITS rbx ; PUT_BITS(temp2, nbits)
shr r11, 1 ; index >>= 1; shr r11, 1 ; index >>= 1;
add rsi, 2 ; ++k; add rsi, 2 ; ++k;
jmp .BLOOP jmp .BLOOP
.ELOOP: .ELOOP:
; If the last coef(s) were zero, emit an end-of-block code ; If the last coef(s) were zero, emit an end-of-block code
lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
cmp rdi, rsi ; if (r > 0) { cmp rdi, rsi ; if (r > 0) {
je .EFN je .EFN
mov ebx, INT [r15] ; code = actbl->ehufco[0]; mov ebx, INT [r15] ; code = actbl->ehufco[0];
movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
EMIT_BITS rbx, r12d EMIT_BITS rbx, r12d
.EFN: .EFN:
pop r10 pop r10
; Save put_buffer & put_bits ; Save put_buffer & put_bits
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits; mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
pop rbx pop rbx
%ifdef WIN64 %ifdef WIN64
movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD] movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD] movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD] movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD] movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD add rsp, 4*SIZEOF_XMMWORD
%endif %endif
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -23,20 +23,20 @@
%include "jsimdext.inc" %include "jsimdext.inc"
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_huff_encode_one_block) global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block): EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc" %include "jpeg_nbits_table.inc"
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; These macros perform the same task as the emit_bits() function in the ; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly ; original libjpeg code. In addition to reducing overhead by explicitly
@@ -46,105 +46,105 @@ EXTN(jconst_huff_encode_one_block):
; bytes can be stored in a 64-bit bit buffer before it has to be emptied. ; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
%macro EMIT_BYTE 0 %macro EMIT_BYTE 0
sub put_bits, 8 ; put_bits -= 8; sub put_bits, 8 ; put_bits -= 8;
mov edx, put_buffer mov edx, put_buffer
mov ecx, put_bits mov ecx, put_bits
shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [eax], dl ; *buffer++ = c; mov byte [eax], dl ; *buffer++ = c;
add eax, 1 add eax, 1
cmp dl, 0xFF ; need to stuff a zero byte? cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END jne %%.EMIT_BYTE_END
mov byte [eax], 0 ; *buffer++ = 0; mov byte [eax], 0 ; *buffer++ = 0;
add eax, 1 add eax, 1
%%.EMIT_BYTE_END: %%.EMIT_BYTE_END:
%endmacro %endmacro
%macro PUT_BITS 1 %macro PUT_BITS 1
add put_bits, ecx ; put_bits += size; add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size); shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1 or put_buffer, %1
%endmacro %endmacro
%macro CHECKBUF15 0 %macro CHECKBUF15 0
cmp put_bits, 16 ; if (put_bits > 31) { cmp put_bits, 16 ; if (put_bits > 31) {
jl %%.CHECKBUF15_END jl %%.CHECKBUF15_END
mov eax, POINTER [esp+buffer] mov eax, POINTER [esp+buffer]
EMIT_BYTE EMIT_BYTE
EMIT_BYTE EMIT_BYTE
mov POINTER [esp+buffer], eax mov POINTER [esp+buffer], eax
%%.CHECKBUF15_END: %%.CHECKBUF15_END:
%endmacro %endmacro
%macro EMIT_BITS 1 %macro EMIT_BITS 1
PUT_BITS %1 PUT_BITS %1
CHECKBUF15 CHECKBUF15
%endmacro %endmacro
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) %macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128(); pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32 %if %1 != 32
pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else %else
pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
%endif %endif
pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg); pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro %endmacro
; ;
@@ -163,264 +163,264 @@ EXTN(jconst_huff_encode_one_block):
; eax + 24 = c_derived_tbl *dctbl
; eax + 28 = c_derived_tbl *actbl

; Stack-frame layout (relative to esp after the prologue below).
%define pad         6*SIZEOF_DWORD     ; Align to 16 bytes
%define t1          pad
%define t2          t1+(DCTSIZE2*SIZEOF_WORD)
%define block       t2+(DCTSIZE2*SIZEOF_WORD)
%define actbl       block+SIZEOF_DWORD
%define buffer      actbl+SIZEOF_DWORD
%define temp        buffer+SIZEOF_DWORD
%define temp2       temp+SIZEOF_DWORD
%define temp3       temp2+SIZEOF_DWORD
%define temp4       temp3+SIZEOF_DWORD
%define temp5       temp4+SIZEOF_DWORD
%define gotptr      temp5+SIZEOF_DWORD  ; void *gotptr
%define put_buffer  ebx
%define put_bits    edi

    align 16
    global EXTN(jsimd_huff_encode_one_block_sse2)

; Huffman-encode one 8x8 coefficient block: the DC difference per section
; F.1.2.1, then the AC coefficients, using SSE2 compares to build a bitmask
; of non-zero coefficients.  Register roles: ebx = put_buffer, edi = put_bits
; (see %defines above); ebp is repurposed as a table pointer after the
; arguments have been read, so the original ebp is kept on the stack.
EXTN(jsimd_huff_encode_one_block_sse2):
    push        ebp
    mov         eax, esp                ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                ; ebp = aligned ebp
    sub         esp, temp5+9*SIZEOF_DWORD-pad
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp

    mov         esi, POINTER [eax+8]    ; (working_state *state)
    mov         put_buffer, DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
    mov         put_bits, DWORD [esi+12]   ; put_bits = state->cur.put_bits;
    push        esi                     ; esi is now scratch

    get_GOT     edx                     ; get GOT address
    movpic      POINTER [esp+gotptr], edx  ; save GOT address

    mov         ecx, POINTER [eax+28]
    mov         edx, POINTER [eax+16]
    mov         esi, POINTER [eax+12]
    mov         POINTER [esp+actbl], ecx
    mov         POINTER [esp+block], edx
    mov         POINTER [esp+buffer], esi

    ; Encode the DC coefficient difference per section F.1.2.1
    mov         esi, POINTER [esp+block]  ; block
    movsx       ecx, word [esi]         ; temp = temp2 = block[0] - last_dc_val;
    sub         ecx, DWORD [eax+20]
    mov         esi, ecx

    ; This is a well-known technique for obtaining the absolute value
    ; without a branch.  It is derived from an assembly language technique
    ; presented in "How to Optimize for the Pentium Processors",
    ; Copyright (c) 1996, 1997 by Agner Fog.
    mov         edx, ecx
    sar         edx, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
    xor         ecx, edx                ; temp ^= temp3;
    sub         ecx, edx                ; temp -= temp3;

    ; For a negative input, want temp2 = bitwise complement of abs(input)
    ; This code assumes we are on a two's complement machine
    add         esi, edx                ; temp2 += temp3;
    mov         DWORD [esp+temp], esi   ; backup temp2 in temp

    ; Find the number of bits needed for the magnitude of the coefficient
    movpic      ebp, POINTER [esp+gotptr]  ; load GOT address (ebp)
    movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
    mov         DWORD [esp+temp2], edx  ; backup nbits in temp2

    ; Emit the Huffman-coded symbol for the number of bits
    mov         ebp, POINTER [eax+24]   ; After this point, arguments are not accessible anymore
    mov         eax, INT [ebp + edx * 4]      ; code = dctbl->ehufco[nbits];
    movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
    EMIT_BITS   eax                     ; EMIT_BITS(code, size)

    mov         ecx, DWORD [esp+temp2]  ; restore nbits

    ; Mask off any extra bits in code
    mov         eax, 1
    shl         eax, cl
    dec         eax
    and         eax, DWORD [esp+temp]   ; temp2 &= (((JLONG) 1)<<nbits) - 1;

    ; Emit that number of bits of the value, if positive,
    ; or the complement of its magnitude, if negative.
    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)

    ; Prepare data
    xor         ecx, ecx
    mov         esi, POINTER [esp+block]
    kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
                  18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
                  27, 20, 13, 6, 7, 14, 21, 28, 35, \
                  xmm0, xmm1, xmm2, xmm3
    kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
                  30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
                  53, 60, 61, 54, 47, 55, 62, 63, 63, \
                  xmm0, xmm1, xmm2, xmm3

    ; Build a 32-bit mask of the non-zero coefficients in t1[0..31].
    pxor        xmm7, xmm7
    movdqa      xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
    movdqa      xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
    movdqa      xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
    movdqa      xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
    pmovmskb    edx, xmm0               ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
    pmovmskb    ecx, xmm2               ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
    shl         ecx, 16
    or          edx, ecx
    not         edx                     ; index = ~index;

    lea         esi, [esp+t1]
    mov         ebp, POINTER [esp+actbl]  ; ebp = actbl

.BLOOP:
    bsf         ecx, edx                ; r = __builtin_ctzl(index);
    jz          .ELOOP
    lea         esi, [esi+ecx*2]        ; k += r;
    shr         edx, cl                 ; index >>= r;
    mov         DWORD [esp+temp3], edx
.BRLOOP:
    cmp         ecx, 16                 ; while (r > 15) {
    jl          .ERLOOP
    sub         ecx, 16                 ; r -= 16;
    mov         DWORD [esp+temp], ecx
    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
    EMIT_BITS   eax                     ; EMIT_BITS(code_0xf0, size_0xf0)
    mov         ecx, DWORD [esp+temp]
    jmp         .BRLOOP
.ERLOOP:
    movsx       eax, word [esi]         ; temp = t1[k];
    movpic      edx, POINTER [esp+gotptr]  ; load GOT address (edx)
    movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
    mov         DWORD [esp+temp2], eax

    ; Emit Huffman symbol for run length / number of bits
    shl         ecx, 4                  ; temp3 = (r << 4) + nbits;
    add         ecx, eax
    mov         eax, INT [ebp + ecx * 4]      ; code = actbl->ehufco[temp3];
    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
    EMIT_BITS   eax

    movsx       edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];

    ; Mask off any extra bits in code
    mov         ecx, DWORD [esp+temp2]
    mov         eax, 1
    shl         eax, cl
    dec         eax
    and         eax, edx                ; temp2 &= (((JLONG) 1)<<nbits) - 1;
    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)

    mov         edx, DWORD [esp+temp3]
    add         esi, 2                  ; ++k;
    shr         edx, 1                  ; index >>= 1;
    jmp         .BLOOP
.ELOOP:
    ; Same mask construction for the second half, t1[32..63].
    movdqa      xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
    movdqa      xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
    movdqa      xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
    movdqa      xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
    pmovmskb    edx, xmm0               ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
    pmovmskb    ecx, xmm2               ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
    shl         ecx, 16
    or          edx, ecx
    not         edx                     ; index = ~index;

    lea         eax, [esp + t1 + (DCTSIZE2/2) * 2]
    sub         eax, esi
    shr         eax, 1
    bsf         ecx, edx                ; r = __builtin_ctzl(index);
    jz          .ELOOP2
    shr         edx, cl                 ; index >>= r;
    add         ecx, eax
    lea         esi, [esi+ecx*2]        ; k += r;
    mov         DWORD [esp+temp3], edx
    jmp         .BRLOOP2
.BLOOP2:
    bsf         ecx, edx                ; r = __builtin_ctzl(index);
    jz          .ELOOP2
    lea         esi, [esi+ecx*2]        ; k += r;
    shr         edx, cl                 ; index >>= r;
    mov         DWORD [esp+temp3], edx
.BRLOOP2:
    cmp         ecx, 16                 ; while (r > 15) {
    jl          .ERLOOP2
    sub         ecx, 16                 ; r -= 16;
    mov         DWORD [esp+temp], ecx
    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
    EMIT_BITS   eax                     ; EMIT_BITS(code_0xf0, size_0xf0)
    mov         ecx, DWORD [esp+temp]
    jmp         .BRLOOP2
.ERLOOP2:
    movsx       eax, word [esi]         ; temp = t1[k];
    bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
    inc         eax
    mov         DWORD [esp+temp2], eax

    ; Emit Huffman symbol for run length / number of bits
    shl         ecx, 4                  ; temp3 = (r << 4) + nbits;
    add         ecx, eax
    mov         eax, INT [ebp + ecx * 4]      ; code = actbl->ehufco[temp3];
    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
    EMIT_BITS   eax

    movsx       edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];

    ; Mask off any extra bits in code
    mov         ecx, DWORD [esp+temp2]
    mov         eax, 1
    shl         eax, cl
    dec         eax
    and         eax, edx                ; temp2 &= (((JLONG) 1)<<nbits) - 1;
    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)

    mov         edx, DWORD [esp+temp3]
    add         esi, 2                  ; ++k;
    shr         edx, 1                  ; index >>= 1;
    jmp         .BLOOP2
.ELOOP2:
    ; If the last coef(s) were zero, emit an end-of-block code
    lea         edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
    cmp         edx, esi                ; if (r > 0) {
    je          .EFN
    mov         eax, INT [ebp]          ; code = actbl->ehufco[0];
    movzx       ecx, byte [ebp + 1024]  ; size = actbl->ehufsi[0];
    EMIT_BITS   eax
.EFN:
    mov         eax, [esp+buffer]
    pop         esi
    ; Save put_buffer & put_bits
    mov         DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
    mov         DWORD [esi+12], put_bits   ; state->cur.put_bits = put_bits;

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align 16

View File

@@ -19,8 +19,8 @@
%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
@@ -39,130 +39,130 @@
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align 16
    global EXTN(jsimd_h2v1_downsample_sse2)

; 2:1 horizontal, 1:1 vertical downsampling (SysV AMD64; args pre-loaded
; into r10-r15 as documented above, fetched via collect_args).
EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args
    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return
    mov         edx, r10d

    ; -- expand_right_edge
    push        rcx
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx
    jle         short .expand_end
    mov         rax, r11
    test        rax, rax
    jle         short .expand_end
    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax
    push        rcx
    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx
    mov         al, JSAMPLE [rdi-1]     ; replicate the rightmost sample
    rep stosb
    pop         rcx
    pop         rax
    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop
.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample
    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return
    mov         rdx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx
    push        rdi
    push        rsi
    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
.columnloop_r8:
    ; Partial final vector: pad the second half with zeros.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample
.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample:
    ; Average each horizontal sample pair with the alternating bias.
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1
    pand        xmm0, xmm6
    psrlw       xmm2, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT
    paddw       xmm0, xmm2
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    psrlw       xmm0, 1
    psrlw       xmm1, 1
    packuswb    xmm0, xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8
    pop         rsi
    pop         rdi
    pop         rcx
    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                     ; rowctr
    jg          near .rowloop
.return:
    uncollect_args
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
@@ -183,147 +183,147 @@ EXTN(jsimd_h2v1_downsample_sse2):
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align 16
    global EXTN(jsimd_h2v2_downsample_sse2)

; 2:1 horizontal, 2:1 vertical downsampling (SysV AMD64; args pre-loaded
; into r10-r15 as documented above, fetched via collect_args).
EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args
    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return
    mov         edx, r10d

    ; -- expand_right_edge
    push        rcx
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx
    jle         short .expand_end
    mov         rax, r11
    test        rax, rax
    jle         short .expand_end
    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax
    push        rcx
    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx
    mov         al, JSAMPLE [rdi-1]     ; replicate the rightmost sample
    rep stosb
    pop         rcx
    pop         rax
    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop
.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample
    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return
    mov         rdx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx
    push        rdi
    push        rsi
    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]     ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
.columnloop_r8:
    ; Partial final vector: pad the second half with zeros.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample
.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample:
    ; Average each 2x2 sample group with the alternating bias.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm0, xmm1
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7
    paddw       xmm2, xmm7
    psrlw       xmm0, 2
    psrlw       xmm2, 2
    packuswb    xmm0, xmm2
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8
    pop         rsi
    pop         rdi
    pop         rcx
    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                     ; rowctr
    jg          near .rowloop
.return:
    uncollect_args
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align 16

View File

@@ -18,8 +18,8 @@
%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
@@ -38,141 +38,141 @@
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

    align 16
    global EXTN(jsimd_h2v1_downsample_sse2)

; 2:1 horizontal, 1:1 vertical downsampling (32-bit cdecl; arguments read
; from the stack via the offset macros above).
EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return
    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge
    push        ecx
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx
    jle         short .expand_end
    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .expand_end
    cld
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax
    push        ecx
    mov         edi, JSAMPROW [esi]
    add         edi, edx
    mov         al, JSAMPLE [edi-1]     ; replicate the rightmost sample
    rep stosb
    pop         ecx
    pop         eax
    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop
.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample
    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return
    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx
    push        edi
    push        esi
    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7
.columnloop_r8:
    ; Partial final vector: pad the second half with zeros.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7
.columnloop:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
    ; Average each horizontal sample pair with the alternating bias.
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1
    pand        xmm0, xmm6
    psrlw       xmm2, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT
    paddw       xmm0, xmm2
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    psrlw       xmm0, 1
    psrlw       xmm1, 1
    packuswb    xmm0, xmm1
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        ecx, ecx
    jnz         short .columnloop_r8
    pop         esi
    pop         edi
    pop         ecx
    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                     ; rowctr
    jg          near .rowloop
.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
@@ -193,158 +193,158 @@ EXTN(jsimd_h2v1_downsample_sse2):
%define input_data(b) (b)+24 ; JSAMPARRAY input_data %define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data
align 16 align 16
global EXTN(jsimd_h2v2_downsample_sse2) global EXTN(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2): EXTN(jsimd_h2v2_downsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
mov ecx, JDIMENSION [width_blks(ebp)] mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return jz near .return
mov edx, JDIMENSION [img_width(ebp)] mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge ; -- expand_right_edge
push ecx push ecx
shl ecx,1 ; output_cols * 2 shl ecx, 1 ; output_cols * 2
sub ecx,edx sub ecx, edx
jle short .expand_end jle short .expand_end
mov eax, INT [max_v_samp(ebp)] mov eax, INT [max_v_samp(ebp)]
test eax,eax test eax, eax
jle short .expand_end jle short .expand_end
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16,7 alignx 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
mov edi, JSAMPROW [esi] mov edi, JSAMPROW [esi]
add edi,edx add edi, edx
mov al, JSAMPLE [edi-1] mov al, JSAMPLE [edi-1]
rep stosb rep stosb
pop ecx pop ecx
pop eax pop eax
add esi, byte SIZEOF_JSAMPROW add esi, byte SIZEOF_JSAMPROW
dec eax dec eax
jg short .expandloop jg short .expandloop
.expand_end: .expand_end:
pop ecx ; output_cols pop ecx ; output_cols
; -- h2v2_downsample ; -- h2v2_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax,eax test eax, eax
jle near .return jle near .return
mov edx, 0x00020001 ; bias pattern mov edx, 0x00020001 ; bias pattern
movd xmm7,edx movd xmm7, edx
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
push esi push esi
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
alignx 16,7 alignx 16, 7
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm3,xmm3 pxor xmm3, xmm3
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample: .downsample:
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
pand xmm0,xmm6 pand xmm0, xmm6
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
pand xmm1,xmm6 pand xmm1, xmm6
psrlw xmm5,BYTE_BIT psrlw xmm5, BYTE_BIT
paddw xmm0,xmm4 paddw xmm0, xmm4
paddw xmm1,xmm5 paddw xmm1, xmm5
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
pand xmm2,xmm6 pand xmm2, xmm6
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
pand xmm3,xmm6 pand xmm3, xmm6
psrlw xmm5,BYTE_BIT psrlw xmm5, BYTE_BIT
paddw xmm2,xmm4 paddw xmm2, xmm4
paddw xmm3,xmm5 paddw xmm3, xmm5
paddw xmm0,xmm1 paddw xmm0, xmm1
paddw xmm2,xmm3 paddw xmm2, xmm3
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm2,xmm7 paddw xmm2, xmm7
psrlw xmm0,2 psrlw xmm0, 2
psrlw xmm2,2 psrlw xmm2, 2
packuswb xmm0,xmm2 packuswb xmm0, xmm2
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
sub ecx, byte SIZEOF_XMMWORD ; outcol sub ecx, byte SIZEOF_XMMWORD ; outcol
add edx, byte 2*SIZEOF_XMMWORD ; inptr0 add edx, byte 2*SIZEOF_XMMWORD ; inptr0
add esi, byte 2*SIZEOF_XMMWORD ; inptr1 add esi, byte 2*SIZEOF_XMMWORD ; inptr1
add edi, byte 1*SIZEOF_XMMWORD ; outptr add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test ecx,ecx test ecx, ecx
jnz near .columnloop_r8 jnz near .columnloop_r8
pop esi pop esi
pop edi pop edi
pop ecx pop ecx
add esi, byte 2*SIZEOF_JSAMPROW ; input_data add esi, byte 2*SIZEOF_JSAMPROW ; input_data
add edi, byte 1*SIZEOF_JSAMPROW ; output_data add edi, byte 1*SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr dec eax ; rowctr
jg near .rowloop jg near .rowloop
.return: .return:
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
; pop ebx ; unused ; pop ebx ; unused
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -34,407 +34,407 @@
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
; r14 = int num_rows ; r14 = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_ycc_rgb_convert_sse2) global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2): EXTN(jsimd_ycc_rgb_convert_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d ; num_cols mov ecx, r10d ; num_cols
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
mov rdi, r11 mov rdi, r11
mov ecx, r12d mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
pop rcx pop rcx
mov rdi, r13 mov rdi, r13
mov eax, r14d mov eax, r14d
test rax,rax test rax, rax
jle near .return jle near .return
.rowloop: .rowloop:
push rax push rax
push rdi push rdi
push rdx push rdx
push rbx push rbx
push rsi push rsi
push rcx ; col push rcx ; col
mov rsi, JSAMPROW [rsi] ; inptr0 mov rsi, JSAMPROW [rsi] ; inptr0
mov rbx, JSAMPROW [rbx] ; inptr1 mov rbx, JSAMPROW [rbx] ; inptr1
mov rdx, JSAMPROW [rdx] ; inptr2 mov rdx, JSAMPROW [rdx] ; inptr2
mov rdi, JSAMPROW [rdi] ; outptr mov rdi, JSAMPROW [rdi] ; outptr
.columnloop: .columnloop:
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
paddw xmm4,xmm7 paddw xmm4, xmm7
paddw xmm5,xmm7 paddw xmm5, xmm7
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr ; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb ; B = Y + 1.77200 * Cb
; ;
; (This implementation) ; (This implementation)
; R = Y + 0.40200 * Cr + Cr ; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm2,xmm4 ; xmm2=CbE movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3,xmm5 ; xmm3=CbO movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4,xmm4 ; xmm4=2*CbE paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5,xmm5 ; xmm5=2*CbO paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6,xmm0 ; xmm6=CrE movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7,xmm1 ; xmm7=CrO movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0,xmm0 ; xmm0=2*CrE paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1,xmm1 ; xmm1=2*CrO paddw xmm1, xmm1 ; xmm1=2*CrO
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
paddw xmm4,[rel PW_ONE] paddw xmm4, [rel PW_ONE]
paddw xmm5,[rel PW_ONE] paddw xmm5, [rel PW_ONE]
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0,[rel PW_ONE] paddw xmm0, [rel PW_ONE]
paddw xmm1,[rel PW_ONE] paddw xmm1, [rel PW_ONE]
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm5,xmm3 paddw xmm5, xmm3
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
punpcklwd xmm2,xmm6 punpcklwd xmm2, xmm6
punpckhwd xmm4,xmm6 punpckhwd xmm4, xmm6
pmaddwd xmm2,[rel PW_MF0344_F0285] pmaddwd xmm2, [rel PW_MF0344_F0285]
pmaddwd xmm4,[rel PW_MF0344_F0285] pmaddwd xmm4, [rel PW_MF0344_F0285]
punpcklwd xmm3,xmm7 punpcklwd xmm3, xmm7
punpckhwd xmm5,xmm7 punpckhwd xmm5, xmm7
pmaddwd xmm3,[rel PW_MF0344_F0285] pmaddwd xmm3, [rel PW_MF0344_F0285]
pmaddwd xmm5,[rel PW_MF0344_F0285] pmaddwd xmm5, [rel PW_MF0344_F0285]
paddd xmm2,[rel PD_ONEHALF] paddd xmm2, [rel PD_ONEHALF]
paddd xmm4,[rel PD_ONEHALF] paddd xmm4, [rel PD_ONEHALF]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm4,SCALEBITS psrad xmm4, SCALEBITS
paddd xmm3,[rel PD_ONEHALF] paddd xmm3, [rel PD_ONEHALF]
paddd xmm5,[rel PD_ONEHALF] paddd xmm5, [rel PD_ONEHALF]
psrad xmm3,SCALEBITS psrad xmm3, SCALEBITS
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test rdi, SIZEOF_XMMWORD-1 test rdi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
add rsi, byte SIZEOF_XMMWORD ; inptr0 add rsi, byte SIZEOF_XMMWORD ; inptr0
add rbx, byte SIZEOF_XMMWORD ; inptr1 add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2 add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
.column_st32: .column_st32:
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub rcx, byte 2*SIZEOF_XMMWORD sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
; space. ; space.
cmp rcx, byte SIZEOF_MMWORD cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [rdi], xmmA movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD psrldq xmmA, SIZEOF_MMWORD
.column_st7: .column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough ; Store the lower 4 bytes of xmmA to the output when it has enough
; space. ; space.
cmp rcx, byte SIZEOF_DWORD cmp rcx, byte SIZEOF_DWORD
jb short .column_st3 jb short .column_st3
movd XMM_DWORD [rdi], xmmA movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD psrldq xmmA, SIZEOF_DWORD
.column_st3: .column_st3:
; Store the lower 2 bytes of rax to the output when it has enough ; Store the lower 2 bytes of rax to the output when it has enough
; space. ; space.
movd eax, xmmA movd eax, xmmA
cmp rcx, byte SIZEOF_WORD cmp rcx, byte SIZEOF_WORD
jb short .column_st1 jb short .column_st1
mov WORD [rdi], ax mov WORD [rdi], ax
add rdi, byte SIZEOF_WORD add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD
shr rax, 16 shr rax, 16
.column_st1: .column_st1:
; Store the lower 1 byte of rax to the output when it has enough ; Store the lower 1 byte of rax to the output when it has enough
; space. ; space.
test rcx, rcx test rcx, rcx
jz short .nextrow jz short .nextrow
mov BYTE [rdi], al mov BYTE [rdi], al
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test rdi, SIZEOF_XMMWORD-1 test rdi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
add rsi, byte SIZEOF_XMMWORD ; inptr0 add rsi, byte SIZEOF_XMMWORD ; inptr0
add rbx, byte SIZEOF_XMMWORD ; inptr1 add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2 add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
.column_st32: .column_st32:
cmp rcx, byte SIZEOF_XMMWORD/2 cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub rcx, byte SIZEOF_XMMWORD/2 sub rcx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4 cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space. ; space.
cmp rcx, byte SIZEOF_XMMWORD/8 cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7 jb short .column_st7
movq MMWORD [rdi], xmmA movq MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4 add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8 sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4 psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7: .column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough ; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space. ; space.
test rcx, rcx test rcx, rcx
jz short .nextrow jz short .nextrow
movd XMM_DWORD [rdi], xmmA movd XMM_DWORD [rdi], xmmA
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
.nextrow: .nextrow:
pop rcx pop rcx
pop rsi pop rsi
pop rbx pop rbx
pop rdx pop rdx
pop rdi pop rdi
pop rax pop rax
add rsi, byte SIZEOF_JSAMPROW add rsi, byte SIZEOF_JSAMPROW
add rbx, byte SIZEOF_JSAMPROW add rbx, byte SIZEOF_JSAMPROW
add rdx, byte SIZEOF_JSAMPROW add rdx, byte SIZEOF_JSAMPROW
add rdi, byte SIZEOF_JSAMPROW ; output_buf add rdi, byte SIZEOF_JSAMPROW ; output_buf
dec rax ; num_rows dec rax ; num_rows
jg near .rowloop jg near .rowloop
sfence ; flush the write buffer sfence ; flush the write buffer
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -28,432 +28,432 @@
; JSAMPARRAY output_buf, int num_rows) ; JSAMPARRAY output_buf, int num_rows)
; ;
%define out_width(b) (b)+8 ; JDIMENSION out_width %define out_width(b) (b)+8 ; JDIMENSION out_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf %define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define input_row(b) (b)+16 ; JDIMENSION input_row %define input_row(b) (b)+16 ; JDIMENSION input_row
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define num_rows(b) (b)+24 ; int num_rows %define num_rows(b) (b)+24 ; int num_rows
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16 align 16
global EXTN(jsimd_ycc_rgb_convert_sse2) global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2): EXTN(jsimd_ycc_rgb_convert_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
mov edi, JSAMPIMAGE [input_buf(eax)] mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [input_row(eax)] mov ecx, JDIMENSION [input_row(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
lea esi, [esi+ecx*SIZEOF_JSAMPROW] lea esi, [esi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW] lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx pop ecx
mov edi, JSAMPARRAY [output_buf(eax)] mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax,eax test eax, eax
jle near .return jle near .return
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push eax push eax
push edi push edi
push edx push edx
push ebx push ebx
push esi push esi
push ecx ; col push ecx ; col
mov esi, JSAMPROW [esi] ; inptr0 mov esi, JSAMPROW [esi] ; inptr0
mov ebx, JSAMPROW [ebx] ; inptr1 mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2 mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax) movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
paddw xmm4,xmm7 paddw xmm4, xmm7
paddw xmm5,xmm7 paddw xmm5, xmm7
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr ; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb ; B = Y + 1.77200 * Cb
; ;
; (This implementation) ; (This implementation)
; R = Y + 0.40200 * Cr + Cr ; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm2,xmm4 ; xmm2=CbE movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3,xmm5 ; xmm3=CbO movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4,xmm4 ; xmm4=2*CbE paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5,xmm5 ; xmm5=2*CbO paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6,xmm0 ; xmm6=CrE movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7,xmm1 ; xmm7=CrO movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0,xmm0 ; xmm0=2*CrE paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1,xmm1 ; xmm1=2*CrO paddw xmm1, xmm1 ; xmm1=2*CrO
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
paddw xmm4,[GOTOFF(eax,PW_ONE)] paddw xmm4, [GOTOFF(eax,PW_ONE)]
paddw xmm5,[GOTOFF(eax,PW_ONE)] paddw xmm5, [GOTOFF(eax,PW_ONE)]
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0,[GOTOFF(eax,PW_ONE)] paddw xmm0, [GOTOFF(eax,PW_ONE)]
paddw xmm1,[GOTOFF(eax,PW_ONE)] paddw xmm1, [GOTOFF(eax,PW_ONE)]
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm5,xmm3 paddw xmm5, xmm3
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
punpcklwd xmm2,xmm6 punpcklwd xmm2, xmm6
punpckhwd xmm4,xmm6 punpckhwd xmm4, xmm6
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm3,xmm7 punpcklwd xmm3, xmm7
punpckhwd xmm5,xmm7 punpckhwd xmm5, xmm7
pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm4,SCALEBITS psrad xmm4, SCALEBITS
paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm3,SCALEBITS psrad xmm3, SCALEBITS
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test edi, SIZEOF_XMMWORD-1 test edi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
add esi, byte SIZEOF_XMMWORD ; inptr0 add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
; space. ; space.
cmp ecx, byte SIZEOF_MMWORD cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [edi], xmmA movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD psrldq xmmA, SIZEOF_MMWORD
.column_st7: .column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough ; Store the lower 4 bytes of xmmA to the output when it has enough
; space. ; space.
cmp ecx, byte SIZEOF_DWORD cmp ecx, byte SIZEOF_DWORD
jb short .column_st3 jb short .column_st3
movd XMM_DWORD [edi], xmmA movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD psrldq xmmA, SIZEOF_DWORD
.column_st3: .column_st3:
; Store the lower 2 bytes of eax to the output when it has enough ; Store the lower 2 bytes of eax to the output when it has enough
; space. ; space.
movd eax, xmmA movd eax, xmmA
cmp ecx, byte SIZEOF_WORD cmp ecx, byte SIZEOF_WORD
jb short .column_st1 jb short .column_st1
mov WORD [edi], ax mov WORD [edi], ax
add edi, byte SIZEOF_WORD add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD
shr eax, 16 shr eax, 16
.column_st1: .column_st1:
; Store the lower 1 byte of eax to the output when it has enough ; Store the lower 1 byte of eax to the output when it has enough
; space. ; space.
test ecx, ecx test ecx, ecx
jz short .nextrow jz short .nextrow
mov BYTE [edi], al mov BYTE [edi], al
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test edi, SIZEOF_XMMWORD-1 test edi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
add esi, byte SIZEOF_XMMWORD ; inptr0 add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2 sub ecx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4 cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space. ; space.
cmp ecx, byte SIZEOF_XMMWORD/8 cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [edi], xmmA movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4 add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8 sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4 psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7: .column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough ; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space. ; space.
test ecx, ecx test ecx, ecx
jz short .nextrow jz short .nextrow
movd XMM_DWORD [edi], xmmA movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
alignx 16,7 alignx 16, 7
.nextrow: .nextrow:
pop ecx pop ecx
pop esi pop esi
pop ebx pop ebx
pop edx pop edx
pop edi pop edi
pop eax pop eax
add esi, byte SIZEOF_JSAMPROW add esi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW add edx, byte SIZEOF_JSAMPROW
add edi, byte SIZEOF_JSAMPROW ; output_buf add edi, byte SIZEOF_JSAMPROW ; output_buf
dec eax ; num_rows dec eax ; num_rows
jg near .rowloop jg near .rowloop
sfence ; flush the write buffer sfence ; flush the write buffer
.return: .return:
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -20,21 +20,21 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414) F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414) F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200) F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200) F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_ycc_rgb_convert_sse2) global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2): EXTN(jconst_ycc_rgb_convert_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
%include "jdcolext-sse2-64.asm" %include "jdcolext-sse2-64.asm"

View File

@@ -20,21 +20,21 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414) F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414) F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200) F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200) F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_ycc_rgb_convert_sse2) global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2): EXTN(jconst_ycc_rgb_convert_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
%include "jdcolext-sse2.asm" %include "jdcolext-sse2.asm"

View File

@@ -17,11 +17,11 @@
; ;
%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples %define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
%define ROW(n,b,s) ((b)+(n)*(s)) %define ROW(n,b,s) ((b)+(n)*(s))
%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) %define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) %define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) %define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) %define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------

View File

@@ -20,21 +20,21 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414) F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414) F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200) F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200) F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_merged_upsample_sse2) global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2): EXTN(jconst_merged_upsample_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
%include "jdmrgext-sse2-64.asm" %include "jdmrgext-sse2-64.asm"

View File

@@ -20,21 +20,21 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define SCALEBITS 16 %define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414) F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414) F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200) F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200) F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_merged_upsample_sse2) global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2): EXTN(jconst_merged_upsample_sse2):
@@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
%include "jdmrgext-sse2.asm" %include "jdmrgext-sse2.asm"

View File

@@ -34,399 +34,399 @@
; r12 = JDIMENSION in_row_group_ctr ; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3 %define WK_NUM 3
align 16 align 16
global EXTN(jsimd_h2v1_merged_upsample_sse2) global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2): EXTN(jsimd_h2v1_merged_upsample_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d ; col mov ecx, r10d ; col
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
mov rdi, r11 mov rdi, r11
mov ecx, r12d mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13 mov rdi, r13
mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
mov rdi, JSAMPROW [rdi] ; outptr mov rdi, JSAMPROW [rdi] ; outptr
pop rcx ; col pop rcx ; col
.columnloop: .columnloop:
movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
pxor xmm1,xmm1 ; xmm1=(all 0's) pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3,xmm3 pcmpeqw xmm3, xmm3
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0,xmm7 movdqa xmm0, xmm7
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
paddw xmm6,xmm3 paddw xmm6, xmm3
paddw xmm4,xmm3 paddw xmm4, xmm3
paddw xmm7,xmm3 paddw xmm7, xmm3
paddw xmm0,xmm3 paddw xmm0, xmm3
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr ; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb ; B = Y + 1.77200 * Cb
; ;
; (This implementation) ; (This implementation)
; R = Y + 0.40200 * Cr + Cr ; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm5,xmm6 ; xmm5=CbH movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2,xmm4 ; xmm2=CbL movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6,xmm6 ; xmm6=2*CbH paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4,xmm4 ; xmm4=2*CbL paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1,xmm7 ; xmm1=CrH movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3,xmm0 ; xmm3=CrL movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7,xmm7 ; xmm7=2*CrH paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0,xmm0 ; xmm0=2*CrL paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
paddw xmm6,[rel PW_ONE] paddw xmm6, [rel PW_ONE]
paddw xmm4,[rel PW_ONE] paddw xmm4, [rel PW_ONE]
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7,[rel PW_ONE] paddw xmm7, [rel PW_ONE]
paddw xmm0,[rel PW_ONE] paddw xmm0, [rel PW_ONE]
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6,xmm5 paddw xmm6, xmm5
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm5,xmm1 punpcklwd xmm5, xmm1
punpckhwd xmm6,xmm1 punpckhwd xmm6, xmm1
pmaddwd xmm5,[rel PW_MF0344_F0285] pmaddwd xmm5, [rel PW_MF0344_F0285]
pmaddwd xmm6,[rel PW_MF0344_F0285] pmaddwd xmm6, [rel PW_MF0344_F0285]
punpcklwd xmm2,xmm3 punpcklwd xmm2, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
pmaddwd xmm2,[rel PW_MF0344_F0285] pmaddwd xmm2, [rel PW_MF0344_F0285]
pmaddwd xmm7,[rel PW_MF0344_F0285] pmaddwd xmm7, [rel PW_MF0344_F0285]
paddd xmm5,[rel PD_ONEHALF] paddd xmm5, [rel PD_ONEHALF]
paddd xmm6,[rel PD_ONEHALF] paddd xmm6, [rel PD_ONEHALF]
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
psrad xmm6,SCALEBITS psrad xmm6, SCALEBITS
paddd xmm2,[rel PD_ONEHALF] paddd xmm2, [rel PD_ONEHALF]
paddd xmm7,[rel PD_ONEHALF] paddd xmm7, [rel PD_ONEHALF]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm7,SCALEBITS psrad xmm7, SCALEBITS
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
mov al,2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
.Yloop_2nd: .Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
.Yloop_1st: .Yloop_1st:
movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test rdi, SIZEOF_XMMWORD-1 test rdi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
add rsi, byte SIZEOF_XMMWORD ; inptr0 add rsi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr dec al ; Yctr
jnz near .Yloop_2nd jnz near .Yloop_2nd
add rbx, byte SIZEOF_XMMWORD ; inptr1 add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2 add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
.column_st32: .column_st32:
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub rcx, byte 2*SIZEOF_XMMWORD sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
; space. ; space.
cmp rcx, byte SIZEOF_MMWORD cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [rdi], xmmA movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD psrldq xmmA, SIZEOF_MMWORD
.column_st7: .column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough ; Store the lower 4 bytes of xmmA to the output when it has enough
; space. ; space.
cmp rcx, byte SIZEOF_DWORD cmp rcx, byte SIZEOF_DWORD
jb short .column_st3 jb short .column_st3
movd XMM_DWORD [rdi], xmmA movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD psrldq xmmA, SIZEOF_DWORD
.column_st3: .column_st3:
; Store the lower 2 bytes of rax to the output when it has enough ; Store the lower 2 bytes of rax to the output when it has enough
; space. ; space.
movd eax, xmmA movd eax, xmmA
cmp rcx, byte SIZEOF_WORD cmp rcx, byte SIZEOF_WORD
jb short .column_st1 jb short .column_st1
mov WORD [rdi], ax mov WORD [rdi], ax
add rdi, byte SIZEOF_WORD add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD
shr rax, 16 shr rax, 16
.column_st1: .column_st1:
; Store the lower 1 byte of rax to the output when it has enough ; Store the lower 1 byte of rax to the output when it has enough
; space. ; space.
test rcx, rcx test rcx, rcx
jz short .endcolumn jz short .endcolumn
mov BYTE [rdi], al mov BYTE [rdi], al
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test rdi, SIZEOF_XMMWORD-1 test rdi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
add rsi, byte SIZEOF_XMMWORD ; inptr0 add rsi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr dec al ; Yctr
jnz near .Yloop_2nd jnz near .Yloop_2nd
add rbx, byte SIZEOF_XMMWORD ; inptr1 add rbx, byte SIZEOF_XMMWORD ; inptr1
add rdx, byte SIZEOF_XMMWORD ; inptr2 add rdx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
.column_st32: .column_st32:
cmp rcx, byte SIZEOF_XMMWORD/2 cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub rcx, byte SIZEOF_XMMWORD/2 sub rcx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4 cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space. ; space.
cmp rcx, byte SIZEOF_XMMWORD/8 cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [rdi], xmmA movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4 add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8 sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4 psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7: .column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough ; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space. ; space.
test rcx, rcx test rcx, rcx
jz short .endcolumn jz short .endcolumn
movd XMM_DWORD [rdi], xmmA movd XMM_DWORD [rdi], xmmA
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
.endcolumn: .endcolumn:
sfence ; flush the write buffer sfence ; flush the write buffer
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; ;
@@ -444,94 +444,94 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; r12 = JDIMENSION in_row_group_ctr ; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
align 16 align 16
global EXTN(jsimd_h2v2_merged_upsample_sse2) global EXTN(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2): EXTN(jsimd_h2v2_merged_upsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
mov eax, r10d mov eax, r10d
mov rdi, r11 mov rdi, r11
mov ecx, r12d mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13 mov rdi, r13
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
push rdx ; inptr2 push rdx ; inptr2
push rbx ; inptr1 push rbx ; inptr1
push rsi ; inptr00 push rsi ; inptr00
mov rbx,rsp mov rbx, rsp
push rdi push rdi
push rcx push rcx
push rax push rax
%ifdef WIN64 %ifdef WIN64
mov r8, rcx mov r8, rcx
mov r9, rdi mov r9, rdi
mov rcx, rax mov rcx, rax
mov rdx, rbx mov rdx, rbx
%else %else
mov rdx, rcx mov rdx, rcx
mov rcx, rdi mov rcx, rdi
mov rdi, rax mov rdi, rax
mov rsi, rbx mov rsi, rbx
%endif %endif
call EXTN(jsimd_h2v1_merged_upsample_sse2) call EXTN(jsimd_h2v1_merged_upsample_sse2)
pop rax pop rax
pop rcx pop rcx
pop rdi pop rdi
pop rsi pop rsi
pop rbx pop rbx
pop rdx pop rdx
add rdi, byte SIZEOF_JSAMPROW ; outptr1 add rdi, byte SIZEOF_JSAMPROW ; outptr1
add rsi, byte SIZEOF_JSAMPROW ; inptr01 add rsi, byte SIZEOF_JSAMPROW ; inptr01
push rdx ; inptr2 push rdx ; inptr2
push rbx ; inptr1 push rbx ; inptr1
push rsi ; inptr00 push rsi ; inptr00
mov rbx,rsp mov rbx, rsp
push rdi push rdi
push rcx push rcx
push rax push rax
%ifdef WIN64 %ifdef WIN64
mov r8, rcx mov r8, rcx
mov r9, rdi mov r9, rdi
mov rcx, rax mov rcx, rax
mov rdx, rbx mov rdx, rbx
%else %else
mov rdx, rcx mov rdx, rcx
mov rcx, rdi mov rcx, rdi
mov rdi, rax mov rdi, rax
mov rsi, rbx mov rsi, rbx
%endif %endif
call EXTN(jsimd_h2v1_merged_upsample_sse2) call EXTN(jsimd_h2v1_merged_upsample_sse2)
pop rax pop rax
pop rcx pop rcx
pop rdi pop rdi
pop rsi pop rsi
pop rbx pop rbx
pop rdx pop rdx
pop rbx pop rbx
uncollect_args uncollect_args
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -29,422 +29,422 @@
; JSAMPARRAY output_buf); ; JSAMPARRAY output_buf);
; ;
%define output_width(b) (b)+8 ; JDIMENSION output_width %define output_width(b) (b)+8 ; JDIMENSION output_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf %define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr %define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3 %define WK_NUM 3
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16 align 16
global EXTN(jsimd_h2v1_merged_upsample_sse2) global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2): EXTN(jsimd_h2v1_merged_upsample_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
mov edi, JSAMPIMAGE [input_buf(eax)] mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [in_row_group_ctr(eax)] mov ecx, JDIMENSION [in_row_group_ctr(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(eax)] mov edi, JSAMPARRAY [output_buf(eax)]
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
pop ecx ; col pop ecx ; col
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax) movpic eax, POINTER [gotptr] ; load GOT address (eax)
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
pxor xmm1,xmm1 ; xmm1=(all 0's) pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3,xmm3 pcmpeqw xmm3, xmm3
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0,xmm7 movdqa xmm0, xmm7
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
paddw xmm6,xmm3 paddw xmm6, xmm3
paddw xmm4,xmm3 paddw xmm4, xmm3
paddw xmm7,xmm3 paddw xmm7, xmm3
paddw xmm0,xmm3 paddw xmm0, xmm3
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr ; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb ; B = Y + 1.77200 * Cb
; ;
; (This implementation) ; (This implementation)
; R = Y + 0.40200 * Cr + Cr ; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm5,xmm6 ; xmm5=CbH movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2,xmm4 ; xmm2=CbL movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6,xmm6 ; xmm6=2*CbH paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4,xmm4 ; xmm4=2*CbL paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1,xmm7 ; xmm1=CrH movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3,xmm0 ; xmm3=CrL movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7,xmm7 ; xmm7=2*CrH paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0,xmm0 ; xmm0=2*CrL paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
paddw xmm6,[GOTOFF(eax,PW_ONE)] paddw xmm6, [GOTOFF(eax,PW_ONE)]
paddw xmm4,[GOTOFF(eax,PW_ONE)] paddw xmm4, [GOTOFF(eax,PW_ONE)]
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7,[GOTOFF(eax,PW_ONE)] paddw xmm7, [GOTOFF(eax,PW_ONE)]
paddw xmm0,[GOTOFF(eax,PW_ONE)] paddw xmm0, [GOTOFF(eax,PW_ONE)]
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6,xmm5 paddw xmm6, xmm5
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm5,xmm1 punpcklwd xmm5, xmm1
punpckhwd xmm6,xmm1 punpckhwd xmm6, xmm1
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm2,xmm3 punpcklwd xmm2, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
psrad xmm6,SCALEBITS psrad xmm6, SCALEBITS
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm7,SCALEBITS psrad xmm7, SCALEBITS
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
mov al,2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
alignx 16,7 alignx 16, 7
.Yloop_2nd: .Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
alignx 16,7 alignx 16, 7
.Yloop_1st: .Yloop_1st:
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test edi, SIZEOF_XMMWORD-1 test edi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
add esi, byte SIZEOF_XMMWORD ; inptr0 add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr dec al ; Yctr
jnz near .Yloop_2nd jnz near .Yloop_2nd
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
; space. ; space.
cmp ecx, byte SIZEOF_MMWORD cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [edi], xmmA movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD psrldq xmmA, SIZEOF_MMWORD
.column_st7: .column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough ; Store the lower 4 bytes of xmmA to the output when it has enough
; space. ; space.
cmp ecx, byte SIZEOF_DWORD cmp ecx, byte SIZEOF_DWORD
jb short .column_st3 jb short .column_st3
movd XMM_DWORD [edi], xmmA movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD psrldq xmmA, SIZEOF_DWORD
.column_st3: .column_st3:
; Store the lower 2 bytes of eax to the output when it has enough ; Store the lower 2 bytes of eax to the output when it has enough
; space. ; space.
movd eax, xmmA movd eax, xmmA
cmp ecx, byte SIZEOF_WORD cmp ecx, byte SIZEOF_WORD
jb short .column_st1 jb short .column_st1
mov WORD [edi], ax mov WORD [edi], ax
add edi, byte SIZEOF_WORD add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD
shr eax, 16 shr eax, 16
.column_st1: .column_st1:
; Store the lower 1 byte of eax to the output when it has enough ; Store the lower 1 byte of eax to the output when it has enough
; space. ; space.
test ecx, ecx test ecx, ecx
jz short .endcolumn jz short .endcolumn
mov BYTE [edi], al mov BYTE [edi], al
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
test edi, SIZEOF_XMMWORD-1 test edi, SIZEOF_XMMWORD-1
jnz short .out1 jnz short .out1
; --(aligned)------------------- ; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
add esi, byte SIZEOF_XMMWORD ; inptr0 add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr dec al ; Yctr
jnz near .Yloop_2nd jnz near .Yloop_2nd
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2 sub ecx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4 cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space. ; space.
cmp ecx, byte SIZEOF_XMMWORD/8 cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7 jb short .column_st7
movq XMM_MMWORD [edi], xmmA movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4 add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8 sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4 psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7: .column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough ; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space. ; space.
test ecx, ecx test ecx, ecx
jz short .endcolumn jz short .endcolumn
movd XMM_DWORD [edi], xmmA movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
.endcolumn: .endcolumn:
sfence ; flush the write buffer sfence ; flush the write buffer
.return: .return:
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; ;
@@ -457,62 +457,62 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; JSAMPARRAY output_buf); ; JSAMPARRAY output_buf);
; ;
%define output_width(b) (b)+8 ; JDIMENSION output_width %define output_width(b) (b)+8 ; JDIMENSION output_width
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf %define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr %define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
align 16 align 16
global EXTN(jsimd_h2v2_merged_upsample_sse2) global EXTN(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2): EXTN(jsimd_h2v2_merged_upsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
mov eax, POINTER [output_width(ebp)] mov eax, POINTER [output_width(ebp)]
mov edi, JSAMPIMAGE [input_buf(ebp)] mov edi, JSAMPIMAGE [input_buf(ebp)]
mov ecx, JDIMENSION [in_row_group_ctr(ebp)] mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(ebp)] mov edi, JSAMPARRAY [output_buf(ebp)]
lea esi, [esi+ecx*SIZEOF_JSAMPROW] lea esi, [esi+ecx*SIZEOF_JSAMPROW]
push edx ; inptr2 push edx ; inptr2
push ebx ; inptr1 push ebx ; inptr1
push esi ; inptr00 push esi ; inptr00
mov ebx,esp mov ebx, esp
push edi ; output_buf (outptr0) push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr push ecx ; in_row_group_ctr
push ebx ; input_buf push ebx ; input_buf
push eax ; output_width push eax ; output_width
call near EXTN(jsimd_h2v1_merged_upsample_sse2) call near EXTN(jsimd_h2v1_merged_upsample_sse2)
add esi, byte SIZEOF_JSAMPROW ; inptr01 add esi, byte SIZEOF_JSAMPROW ; inptr01
add edi, byte SIZEOF_JSAMPROW ; outptr1 add edi, byte SIZEOF_JSAMPROW ; outptr1
mov POINTER [ebx+0*SIZEOF_POINTER], esi mov POINTER [ebx+0*SIZEOF_POINTER], esi
mov POINTER [ebx-1*SIZEOF_POINTER], edi mov POINTER [ebx-1*SIZEOF_POINTER], edi
call near EXTN(jsimd_h2v1_merged_upsample_sse2) call near EXTN(jsimd_h2v1_merged_upsample_sse2)
add esp, byte 7*SIZEOF_DWORD add esp, byte 7*SIZEOF_DWORD
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -25,32 +25,32 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_fdct_float_sse) global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse): EXTN(jconst_fdct_float_sse):
PD_0_382 times 4 dd 0.382683432365089771728460 PD_0_382 times 4 dd 0.382683432365089771728460
PD_0_707 times 4 dd 0.707106781186547524400844 PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723 PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643 PD_1_306 times 4 dd 1.306562964876376527856643
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Perform the forward DCT on one block of samples. ; Perform the forward DCT on one block of samples.
; ;
@@ -60,298 +60,298 @@ PD_1_306 times 4 dd 1.306562964876376527856643
; r10 = FAST_FLOAT *data ; r10 = FAST_FLOAT *data
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_fdct_float_sse) global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse): EXTN(jsimd_fdct_float_sse):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov rdx, r10 ; (FAST_FLOAT *) mov rdx, r10 ; (FAST_FLOAT *)
mov rcx, DCTSIZE/4 mov rcx, DCTSIZE/4
.rowloop: .rowloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
; xmm0=(20 21 22 23), xmm2=(24 25 26 27) ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37) ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 01 02 03), xmm1=(04 05 06 07) ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17) ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[rel PD_0_707] ; xmm0=z1 mulps xmm0, [rel PD_0_707] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part ; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[rel PD_0_707] ; xmm3=z3 mulps xmm3, [rel PD_0_707] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[rel PD_0_382] ; xmm2=z5 mulps xmm2, [rel PD_0_382] ; xmm2=z5
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec rcx dec rcx
jnz near .rowloop jnz near .rowloop
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
mov rdx, r10 ; (FAST_FLOAT *) mov rdx, r10 ; (FAST_FLOAT *)
mov rcx, DCTSIZE/4 mov rcx, DCTSIZE/4
.columnloop: .columnloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
; xmm0=(02 12 22 32), xmm2=(42 52 62 72) ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73) ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 10 20 30), xmm1=(40 50 60 70) ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71) ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[rel PD_0_707] ; xmm0=z1 mulps xmm0, [rel PD_0_707] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part ; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[rel PD_0_707] ; xmm3=z3 mulps xmm3, [rel PD_0_707] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[rel PD_0_382] ; xmm2=z5 mulps xmm2, [rel PD_0_382] ; xmm2=z5
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
add rdx, byte 4*SIZEOF_FAST_FLOAT add rdx, byte 4*SIZEOF_FAST_FLOAT
dec rcx dec rcx
jnz near .columnloop jnz near .columnloop
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -24,32 +24,32 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_fdct_float_sse) global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse): EXTN(jconst_fdct_float_sse):
PD_0_382 times 4 dd 0.382683432365089771728460 PD_0_382 times 4 dd 0.382683432365089771728460
PD_0_707 times 4 dd 0.707106781186547524400844 PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723 PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643 PD_1_306 times 4 dd 1.306562964876376527856643
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Perform the forward DCT on one block of samples. ; Perform the forward DCT on one block of samples.
; ;
@@ -57,313 +57,313 @@ PD_1_306 times 4 dd 1.306562964876376527856643
; jsimd_fdct_float_sse (FAST_FLOAT *data) ; jsimd_fdct_float_sse (FAST_FLOAT *data)
; ;
%define data(b) (b)+8 ; FAST_FLOAT *data %define data(b) (b)+8 ; FAST_FLOAT *data
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_fdct_float_sse) global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse): EXTN(jsimd_fdct_float_sse):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
; xmm0=(20 21 22 23), xmm2=(24 25 26 27) ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37) ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 01 02 03), xmm1=(04 05 06 07) ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17) ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part ; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx dec ecx
jnz near .rowloop jnz near .rowloop
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
; xmm0=(02 12 22 32), xmm2=(42 52 62 72) ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73) ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 10 20 30), xmm1=(40 50 60 70) ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71) ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part ; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
add edx, byte 4*SIZEOF_FAST_FLOAT add edx, byte 4*SIZEOF_FAST_FLOAT
dec ecx dec ecx
jnz near .columnloop jnz near .columnloop
; pop edi ; unused ; pop edi ; unused
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -26,46 +26,46 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define CONST_BITS 8 ; 14 is also OK. %define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8 %if CONST_BITS == 8
F_0_382 equ 98 ; FIX(0.382683433) F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100) F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781) F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965) F_1_306 equ 334 ; FIX(1.306562965)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16 alignz 16
global EXTN(jconst_fdct_ifast_sse2) global EXTN(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2): EXTN(jconst_fdct_ifast_sse2):
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Perform the forward DCT on one block of samples. ; Perform the forward DCT on one block of samples.
; ;
@@ -75,317 +75,317 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
; r10 = DCTELEM *data ; r10 = DCTELEM *data
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_fdct_ifast_sse2) global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2): EXTN(jsimd_fdct_ifast_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov rdx, r10 ; (DCTELEM *) mov rdx, r10 ; (DCTELEM *)
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1) movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1) movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1) movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2) movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2) movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2) movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3) movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3) movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6,xmm1 movdqa xmm6, xmm1
movdqa xmm3,xmm0 movdqa xmm3, xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3) movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm0,xmm6 movdqa xmm0, xmm6
psubw xmm3,xmm1 ; xmm3=tmp13 psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6,xmm7 ; xmm6=tmp12 psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4,xmm1 ; xmm4=tmp10 paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0,xmm7 ; xmm0=tmp11 paddw xmm0, xmm7 ; xmm0=tmp11
paddw xmm6,xmm3 paddw xmm6, xmm3
psllw xmm6,PRE_MULTIPLY_SCALE_BITS psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm7,xmm3 movdqa xmm7, xmm3
psubw xmm4,xmm0 ; xmm4=data4 psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3,xmm6 ; xmm3=data6 psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1,xmm0 ; xmm1=data0 paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7,xmm6 ; xmm7=data2 paddw xmm7, xmm6 ; xmm7=data2
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
; -- Odd part ; -- Odd part
paddw xmm2,xmm5 ; xmm2=tmp10 paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5,xmm0 ; xmm5=tmp11 paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
movdqa xmm4,xmm2 ; xmm4=tmp10 movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2,xmm0 psubw xmm2, xmm0
pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm2 ; xmm4=z2 paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0,xmm2 ; xmm0=z4 paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm6,xmm5 ; xmm6=z13 psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3,xmm5 ; xmm3=z11 paddw xmm3, xmm5 ; xmm3=z11
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm5,xmm3 movdqa xmm5, xmm3
psubw xmm6,xmm4 ; xmm6=data3 psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3,xmm0 ; xmm3=data7 psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2,xmm4 ; xmm2=data5 paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5,xmm0 ; xmm5=data1 paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4,xmm1 ; transpose coefficients(phase 1) movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0,xmm7 ; transpose coefficients(phase 1) movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7,xmm5 ; transpose coefficients(phase 1) movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0,xmm6 ; transpose coefficients(phase 1) movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2,xmm5 ; transpose coefficients(phase 2) movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3,xmm7 ; transpose coefficients(phase 2) movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2,xmm1 ; transpose coefficients(phase 2) movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7,xmm4 ; transpose coefficients(phase 2) movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6,xmm1 ; transpose coefficients(phase 3) movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0,xmm7 ; transpose coefficients(phase 3) movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm3,xmm1 movdqa xmm3, xmm1
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6,xmm2 ; transpose coefficients(phase 3) movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1,xmm4 ; transpose coefficients(phase 3) movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm1,xmm5 movdqa xmm1, xmm5
psubw xmm3,xmm6 ; xmm3=tmp13 psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5,xmm2 ; xmm5=tmp12 psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4,xmm6 ; xmm4=tmp10 paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1,xmm2 ; xmm1=tmp11 paddw xmm1, xmm2 ; xmm1=tmp11
paddw xmm5,xmm3 paddw xmm5, xmm3
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm4,xmm1 ; xmm4=data4 psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3,xmm5 ; xmm3=data6 psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6,xmm1 ; xmm6=data0 paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2,xmm5 ; xmm2=data2 paddw xmm2, xmm5 ; xmm2=data2
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
; -- Odd part ; -- Odd part
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7,xmm0 ; xmm7=tmp10 paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0,xmm1 ; xmm0=tmp11 paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
psllw xmm7,PRE_MULTIPLY_SCALE_BITS psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
movdqa xmm4,xmm7 ; xmm4=tmp10 movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7,xmm1 psubw xmm7, xmm1
pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm7 ; xmm4=z2 paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1,xmm7 ; xmm1=z4 paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3,xmm5 movdqa xmm3, xmm5
psubw xmm5,xmm0 ; xmm5=z13 psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3,xmm0 ; xmm3=z11 paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm5,xmm4 ; xmm5=data3 psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3,xmm1 ; xmm3=data7 psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6,xmm4 ; xmm6=data5 paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2,xmm1 ; xmm2=data1 paddw xmm2, xmm1 ; xmm2=data1
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -25,46 +25,46 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define CONST_BITS 8 ; 14 is also OK. %define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8 %if CONST_BITS == 8
F_0_382 equ 98 ; FIX(0.382683433) F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100) F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781) F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965) F_1_306 equ 334 ; FIX(1.306562965)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16 alignz 16
global EXTN(jconst_fdct_ifast_sse2) global EXTN(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2): EXTN(jconst_fdct_ifast_sse2):
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Perform the forward DCT on one block of samples. ; Perform the forward DCT on one block of samples.
; ;
@@ -72,332 +72,332 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
; jsimd_fdct_ifast_sse2 (DCTELEM *data) ; jsimd_fdct_ifast_sse2 (DCTELEM *data)
; ;
%define data(b) (b)+8 ; DCTELEM *data %define data(b) (b)+8 ; DCTELEM *data
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_fdct_ifast_sse2) global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2): EXTN(jsimd_fdct_ifast_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *) mov edx, POINTER [data(eax)] ; (DCTELEM *)
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1) movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1) movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1) movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2) movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2) movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2) movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3) movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3) movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6,xmm1 movdqa xmm6, xmm1
movdqa xmm3,xmm0 movdqa xmm3, xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3) movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm0,xmm6 movdqa xmm0, xmm6
psubw xmm3,xmm1 ; xmm3=tmp13 psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6,xmm7 ; xmm6=tmp12 psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4,xmm1 ; xmm4=tmp10 paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0,xmm7 ; xmm0=tmp11 paddw xmm0, xmm7 ; xmm0=tmp11
paddw xmm6,xmm3 paddw xmm6, xmm3
psllw xmm6,PRE_MULTIPLY_SCALE_BITS psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm7,xmm3 movdqa xmm7, xmm3
psubw xmm4,xmm0 ; xmm4=data4 psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3,xmm6 ; xmm3=data6 psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1,xmm0 ; xmm1=data0 paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7,xmm6 ; xmm7=data2 paddw xmm7, xmm6 ; xmm7=data2
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
; -- Odd part ; -- Odd part
paddw xmm2,xmm5 ; xmm2=tmp10 paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5,xmm0 ; xmm5=tmp11 paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
movdqa xmm4,xmm2 ; xmm4=tmp10 movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2,xmm0 psubw xmm2, xmm0
pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm2 ; xmm4=z2 paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0,xmm2 ; xmm0=z4 paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm6,xmm5 ; xmm6=z13 psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3,xmm5 ; xmm3=z11 paddw xmm3, xmm5 ; xmm3=z11
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm5,xmm3 movdqa xmm5, xmm3
psubw xmm6,xmm4 ; xmm6=data3 psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3,xmm0 ; xmm3=data7 psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2,xmm4 ; xmm2=data5 paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5,xmm0 ; xmm5=data1 paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
; mov edx, POINTER [data(eax)] ; (DCTELEM *) ; mov edx, POINTER [data(eax)] ; (DCTELEM *)
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4,xmm1 ; transpose coefficients(phase 1) movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0,xmm7 ; transpose coefficients(phase 1) movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7,xmm5 ; transpose coefficients(phase 1) movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0,xmm6 ; transpose coefficients(phase 1) movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2,xmm5 ; transpose coefficients(phase 2) movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3,xmm7 ; transpose coefficients(phase 2) movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2,xmm1 ; transpose coefficients(phase 2) movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7,xmm4 ; transpose coefficients(phase 2) movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6,xmm1 ; transpose coefficients(phase 3) movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0,xmm7 ; transpose coefficients(phase 3) movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm3,xmm1 movdqa xmm3, xmm1
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6,xmm2 ; transpose coefficients(phase 3) movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1,xmm4 ; transpose coefficients(phase 3) movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm1,xmm5 movdqa xmm1, xmm5
psubw xmm3,xmm6 ; xmm3=tmp13 psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5,xmm2 ; xmm5=tmp12 psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4,xmm6 ; xmm4=tmp10 paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1,xmm2 ; xmm1=tmp11 paddw xmm1, xmm2 ; xmm1=tmp11
paddw xmm5,xmm3 paddw xmm5, xmm3
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm4,xmm1 ; xmm4=data4 psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3,xmm5 ; xmm3=data6 psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6,xmm1 ; xmm6=data0 paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2,xmm5 ; xmm2=data2 paddw xmm2, xmm5 ; xmm2=data2
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
; -- Odd part ; -- Odd part
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7,xmm0 ; xmm7=tmp10 paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0,xmm1 ; xmm0=tmp11 paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
psllw xmm7,PRE_MULTIPLY_SCALE_BITS psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
movdqa xmm4,xmm7 ; xmm4=tmp10 movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7,xmm1 psubw xmm7, xmm1
pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm7 ; xmm4=z2 paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1,xmm7 ; xmm1=z4 paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3,xmm5 movdqa xmm3, xmm5
psubw xmm5,xmm0 ; xmm5=z13 psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3,xmm0 ; xmm3=z11 paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm5,xmm4 ; xmm5=data3 psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3,xmm1 ; xmm3=data7 psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6,xmm4 ; xmm6=data5 paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2,xmm1 ; xmm2=data1 paddw xmm2, xmm1 ; xmm2=data1
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
; pop edi ; unused ; pop edi ; unused
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -25,34 +25,34 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_idct_float_sse2) global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2): EXTN(jconst_idct_float_sse2):
PD_1_414 times 4 dd 1.414213562373095048801689 PD_1_414 times 4 dd 1.414213562373095048801689
PD_1_847 times 4 dd 1.847759065022573512256366 PD_1_847 times 4 dd 1.847759065022573512256366
PD_1_082 times 4 dd 1.082392200292393968799446 PD_1_082 times 4 dd 1.082392200292393968799446
PD_M2_613 times 4 dd -2.613125929752753055713286 PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Perform dequantization and inverse DCT on one block of coefficients. ; Perform dequantization and inverse DCT on one block of coefficients.
; ;
@@ -66,417 +66,417 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13 = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2] ; FAST_FLOAT workspace[DCTSIZE2]
align 16 align 16
global EXTN(jsimd_idct_float_sse2) global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2): EXTN(jsimd_idct_float_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [workspace] lea rsp, [workspace]
collect_args collect_args
push rbx push rbx
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
mov rdx, r10 ; quantptr mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr mov rsi, r11 ; inptr
lea rdi, [workspace] ; FAST_FLOAT *wsptr lea rdi, [workspace] ; FAST_FLOAT *wsptr
mov rcx, DCTSIZE/4 ; ctr mov rcx, DCTSIZE/4 ; ctr
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT jnz near .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm2 por xmm1, xmm2
por xmm3,xmm4 por xmm3, xmm4
por xmm5,xmm6 por xmm5, xmm6
por xmm1,xmm3 por xmm1, xmm3
por xmm5,xmm7 por xmm5, xmm7
por xmm1,xmm5 por xmm1, xmm5
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm1,xmm0 movaps xmm1, xmm0
movaps xmm2,xmm0 movaps xmm2, xmm0
movaps xmm3,xmm0 movaps xmm3, xmm0
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn jmp near .nextcolumn
%endif %endif
.columnDCT: .columnDCT:
; -- Even part ; -- Even part
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[rel PD_1_414] mulps xmm1, [rel PD_1_414]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part ; -- Odd part
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[rel PD_1_847] ; xmm0=z5 mulps xmm0, [rel PD_1_847] ; xmm0=z5
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,xmm6 ; transpose coefficients(phase 1) movaps xmm1, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
movaps xmm3,xmm0 ; transpose coefficients(phase 1) movaps xmm3, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm3,xmm5 movaps xmm3, xmm5
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
movaps xmm2,xmm7 ; transpose coefficients(phase 1) movaps xmm2, xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
movaps xmm4,xmm5 ; transpose coefficients(phase 1) movaps xmm4, xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3,xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0,xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6,xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3,xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
.nextcolumn: .nextcolumn:
add rsi, byte 4*SIZEOF_JCOEF ; coef_block add rsi, byte 4*SIZEOF_JCOEF ; coef_block
add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
dec rcx ; ctr dec rcx ; ctr
jnz near .columnloop jnz near .columnloop
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp] mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT *wsptr lea rsi, [workspace] ; FAST_FLOAT *wsptr
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
mov rcx, DCTSIZE/4 ; ctr mov rcx, DCTSIZE/4 ; ctr
.rowloop: .rowloop:
; -- Even part ; -- Even part
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[rel PD_1_414] mulps xmm1, [rel PD_1_414]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part ; -- Odd part
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[rel PD_1_847] ; xmm0=z5 mulps xmm0, [rel PD_1_847] ; xmm0=z5
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm3,xmm3 pcmpeqd xmm3, xmm3
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm5,xmm3 movaps xmm5, xmm3
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm4,xmm4 pcmpeqd xmm4, xmm4
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6,xmm2 paddb xmm6, xmm2
paddb xmm1,xmm2 paddb xmm1, xmm2
movdqa xmm4,xmm6 ; transpose coefficients(phase 2) movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm7,xmm6 ; transpose coefficients(phase 3) movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add rdi, byte 4*SIZEOF_JSAMPROW add rdi, byte 4*SIZEOF_JSAMPROW
dec rcx ; ctr dec rcx ; ctr
jnz near .rowloop jnz near .rowloop
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -24,34 +24,34 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_idct_float_sse2) global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2): EXTN(jconst_idct_float_sse2):
PD_1_414 times 4 dd 1.414213562373095048801689 PD_1_414 times 4 dd 1.414213562373095048801689
PD_1_847 times 4 dd 1.847759065022573512256366 PD_1_847 times 4 dd 1.847759065022573512256366
PD_1_082 times 4 dd 1.082392200292393968799446 PD_1_082 times 4 dd 1.082392200292393968799446
PD_M2_613 times 4 dd -2.613125929752753055713286 PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Perform dequantization and inverse DCT on one block of coefficients. ; Perform dequantization and inverse DCT on one block of coefficients.
; ;
@@ -60,438 +60,438 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; JSAMPARRAY output_buf, JDIMENSION output_col) ; JSAMPARRAY output_buf, JDIMENSION output_col)
; ;
%define dct_table(b) (b)+8 ; void *dct_table %define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col %define output_col(b) (b)+20 ; JDIMENSION output_col
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2] ; FAST_FLOAT workspace[DCTSIZE2]
align 16 align 16
global EXTN(jsimd_idct_float_sse2) global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2): EXTN(jsimd_idct_float_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [workspace] lea esp, [workspace]
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
; mov eax, [original_ebp] ; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT jnz near .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm2 por xmm1, xmm2
por xmm3,xmm4 por xmm3, xmm4
por xmm5,xmm6 por xmm5, xmm6
por xmm1,xmm3 por xmm1, xmm3
por xmm5,xmm7 por xmm5, xmm7
por xmm1,xmm5 por xmm1, xmm5
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm1,xmm0 movaps xmm1, xmm0
movaps xmm2,xmm0 movaps xmm2, xmm0
movaps xmm3,xmm0 movaps xmm3, xmm0
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
; -- Even part ; -- Even part
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[GOTOFF(ebx,PD_1_414)] mulps xmm1, [GOTOFF(ebx,PD_1_414)]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part ; -- Odd part
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,xmm6 ; transpose coefficients(phase 1) movaps xmm1, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
movaps xmm3,xmm0 ; transpose coefficients(phase 1) movaps xmm3, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm3,xmm5 movaps xmm3, xmm5
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
movaps xmm2,xmm7 ; transpose coefficients(phase 1) movaps xmm2, xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
movaps xmm4,xmm5 ; transpose coefficients(phase 1) movaps xmm4, xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3,xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0,xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6,xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3,xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
.nextcolumn: .nextcolumn:
add esi, byte 4*SIZEOF_JCOEF ; coef_block add esi, byte 4*SIZEOF_JCOEF ; coef_block
add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
dec ecx ; ctr dec ecx ; ctr
jnz near .columnloop jnz near .columnloop
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov eax, [original_ebp] mov eax, [original_ebp]
lea esi, [workspace] ; FAST_FLOAT *wsptr lea esi, [workspace] ; FAST_FLOAT *wsptr
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[GOTOFF(ebx,PD_1_414)] mulps xmm1, [GOTOFF(ebx,PD_1_414)]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
; -- Odd part ; -- Odd part
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
pcmpeqd xmm3,xmm3 pcmpeqd xmm3, xmm3
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm5,xmm3 movaps xmm5, xmm3
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
pcmpeqd xmm4,xmm4 pcmpeqd xmm4, xmm4
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6,xmm2 paddb xmm6, xmm2
paddb xmm1,xmm2 paddb xmm1, xmm2
movdqa xmm4,xmm6 ; transpose coefficients(phase 2) movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm7,xmm6 ; transpose coefficients(phase 3) movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pushpic ebx ; save GOT address pushpic ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
poppic ebx ; restore GOT address poppic ebx ; restore GOT address
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW add edi, byte 4*SIZEOF_JSAMPROW
dec ecx ; ctr dec ecx ; ctr
jnz near .rowloop jnz near .rowloop
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -26,54 +26,54 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define CONST_BITS 8 ; 14 is also OK. %define CONST_BITS 8 ; 14 is also OK.
%define PASS1_BITS 2 %define PASS1_BITS 2
%if IFAST_SCALE_BITS != PASS1_BITS %if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif %endif
%if CONST_BITS == 8 %if CONST_BITS == 8
F_1_082 equ 277 ; FIX(1.082392200) F_1_082 equ 277 ; FIX(1.082392200)
F_1_414 equ 362 ; FIX(1.414213562) F_1_414 equ 362 ; FIX(1.414213562)
F_1_847 equ 473 ; FIX(1.847759065) F_1_847 equ 473 ; FIX(1.847759065)
F_2_613 equ 669 ; FIX(2.613125930) F_2_613 equ 669 ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16 alignz 16
global EXTN(jconst_idct_ifast_sse2) global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2): EXTN(jconst_idct_ifast_sse2):
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Perform dequantization and inverse DCT on one block of coefficients. ; Perform dequantization and inverse DCT on one block of coefficients.
; ;
@@ -87,405 +87,405 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13 = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_idct_ifast_sse2) global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2): EXTN(jsimd_idct_ifast_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
mov rdx, r10 ; quantptr mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm0 por xmm1, xmm0
packsswb xmm1,xmm1 packsswb xmm1, xmm1
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end jmp near .column_end
%endif %endif
.columnDCT: .columnDCT:
; -- Even part ; -- Even part
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
psubw xmm0,xmm2 ; xmm0=tmp11 psubw xmm0, xmm2 ; xmm0=tmp11
psubw xmm1,xmm3 psubw xmm1, xmm3
paddw xmm4,xmm2 ; xmm4=tmp10 paddw xmm4, xmm2 ; xmm4=tmp10
paddw xmm5,xmm3 ; xmm5=tmp13 paddw xmm5, xmm3 ; xmm5=tmp13
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1,[rel PW_F1414] pmulhw xmm1, [rel PW_F1414]
psubw xmm1,xmm5 ; xmm1=tmp12 psubw xmm1, xmm5 ; xmm1=tmp12
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm7,xmm0 movdqa xmm7, xmm0
psubw xmm4,xmm5 ; xmm4=tmp3 psubw xmm4, xmm5 ; xmm4=tmp3
psubw xmm0,xmm1 ; xmm0=tmp2 psubw xmm0, xmm1 ; xmm0=tmp2
paddw xmm6,xmm5 ; xmm6=tmp0 paddw xmm6, xmm5 ; xmm6=tmp0
paddw xmm7,xmm1 ; xmm7=tmp1 paddw xmm7, xmm1 ; xmm7=tmp1
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
; -- Odd part ; -- Odd part
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm2,xmm1 ; xmm2=z12 psubw xmm2, xmm1 ; xmm2=z12
psubw xmm5,xmm3 ; xmm5=z10 psubw xmm5, xmm3 ; xmm5=z10
paddw xmm4,xmm1 ; xmm4=z11 paddw xmm4, xmm1 ; xmm4=z11
paddw xmm0,xmm3 ; xmm0=z13 paddw xmm0, xmm3 ; xmm0=z13
movdqa xmm1,xmm5 ; xmm1=z10(unscaled) movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
movdqa xmm3,xmm4 movdqa xmm3, xmm4
psubw xmm4,xmm0 psubw xmm4, xmm0
paddw xmm3,xmm0 ; xmm3=tmp7 paddw xmm3, xmm0 ; xmm3=tmp7
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
; (Original) ; (Original)
; tmp12 = -2.613125930 * z10 + z5; ; tmp12 = -2.613125930 * z10 + z5;
; ;
; (This implementation) ; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm0,xmm5 movdqa xmm0, xmm5
paddw xmm5,xmm2 paddw xmm5, xmm2
pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
pmulhw xmm0,[rel PW_MF1613] pmulhw xmm0, [rel PW_MF1613]
pmulhw xmm2,[rel PW_F1082] pmulhw xmm2, [rel PW_F1082]
psubw xmm0,xmm1 psubw xmm0, xmm1
psubw xmm2,xmm5 ; xmm2=tmp10 psubw xmm2, xmm5 ; xmm2=tmp10
paddw xmm0,xmm5 ; xmm0=tmp12 paddw xmm0, xmm5 ; xmm0=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm0,xmm3 ; xmm0=tmp6 psubw xmm0, xmm3 ; xmm0=tmp6
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4,xmm0 ; xmm4=tmp5 psubw xmm4, xmm0 ; xmm4=tmp5
movdqa xmm3,xmm6 ; transpose coefficients(phase 1) movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0,xmm5 ; transpose coefficients(phase 1) movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
paddw xmm2,xmm4 ; xmm2=tmp4 paddw xmm2, xmm4 ; xmm2=tmp4
movdqa xmm5,xmm7 movdqa xmm5, xmm7
movdqa xmm0,xmm1 movdqa xmm0, xmm1
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm7 ; transpose coefficients(phase 1) movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm1 ; transpose coefficients(phase 1) movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm0,xmm3 ; transpose coefficients(phase 2) movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2) movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa xmm3,xmm1 ; transpose coefficients(phase 2) movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 2) movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm4,xmm6 ; transpose coefficients(phase 3) movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7,xmm5 ; transpose coefficients(phase 3) movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa xmm4,xmm1 ; transpose coefficients(phase 3) movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7,xmm3 ; transpose coefficients(phase 3) movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp] mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
; -- Even part ; -- Even part
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm6,xmm1 ; xmm6=tmp11 psubw xmm6, xmm1 ; xmm6=tmp11
psubw xmm5,xmm3 psubw xmm5, xmm3
paddw xmm2,xmm1 ; xmm2=tmp10 paddw xmm2, xmm1 ; xmm2=tmp10
paddw xmm0,xmm3 ; xmm0=tmp13 paddw xmm0, xmm3 ; xmm0=tmp13
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F1414] pmulhw xmm5, [rel PW_F1414]
psubw xmm5,xmm0 ; xmm5=tmp12 psubw xmm5, xmm0 ; xmm5=tmp12
movdqa xmm1,xmm2 movdqa xmm1, xmm2
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm2,xmm0 ; xmm2=tmp3 psubw xmm2, xmm0 ; xmm2=tmp3
psubw xmm6,xmm5 ; xmm6=tmp2 psubw xmm6, xmm5 ; xmm6=tmp2
paddw xmm1,xmm0 ; xmm1=tmp0 paddw xmm1, xmm0 ; xmm1=tmp0
paddw xmm3,xmm5 ; xmm3=tmp1 paddw xmm3, xmm5 ; xmm3=tmp1
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
; -- Odd part ; -- Odd part
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm6,xmm4 movdqa xmm6, xmm4
psubw xmm0,xmm7 ; xmm0=z12 psubw xmm0, xmm7 ; xmm0=z12
psubw xmm4,xmm5 ; xmm4=z10 psubw xmm4, xmm5 ; xmm4=z10
paddw xmm2,xmm7 ; xmm2=z11 paddw xmm2, xmm7 ; xmm2=z11
paddw xmm6,xmm5 ; xmm6=z13 paddw xmm6, xmm5 ; xmm6=z13
movdqa xmm7,xmm4 ; xmm7=z10(unscaled) movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
movdqa xmm5,xmm2 movdqa xmm5, xmm2
psubw xmm2,xmm6 psubw xmm2, xmm6
paddw xmm5,xmm6 ; xmm5=tmp7 paddw xmm5, xmm6 ; xmm5=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
; (Original) ; (Original)
; tmp12 = -2.613125930 * z10 + z5; ; tmp12 = -2.613125930 * z10 + z5;
; ;
; (This implementation) ; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm6,xmm4 movdqa xmm6, xmm4
paddw xmm4,xmm0 paddw xmm4, xmm0
pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
pmulhw xmm6,[rel PW_MF1613] pmulhw xmm6, [rel PW_MF1613]
pmulhw xmm0,[rel PW_F1082] pmulhw xmm0, [rel PW_F1082]
psubw xmm6,xmm7 psubw xmm6, xmm7
psubw xmm0,xmm4 ; xmm0=tmp10 psubw xmm0, xmm4 ; xmm0=tmp10
paddw xmm6,xmm4 ; xmm6=tmp12 paddw xmm6, xmm4 ; xmm6=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm6,xmm5 ; xmm6=tmp6 psubw xmm6, xmm5 ; xmm6=tmp6
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm3 movdqa xmm4, xmm3
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1,(PASS1_BITS+3) ; descale psraw xmm1, (PASS1_BITS+3) ; descale
psraw xmm3,(PASS1_BITS+3) ; descale psraw xmm3, (PASS1_BITS+3) ; descale
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psubw xmm2,xmm6 ; xmm2=tmp5 psubw xmm2, xmm6 ; xmm2=tmp5
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
paddw xmm0,xmm2 ; xmm0=tmp4 paddw xmm0, xmm2 ; xmm0=tmp4
movdqa xmm4,xmm5 movdqa xmm4, xmm5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5,(PASS1_BITS+3) ; descale psraw xmm5, (PASS1_BITS+3) ; descale
psraw xmm6,(PASS1_BITS+3) ; descale psraw xmm6, (PASS1_BITS+3) ; descale
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm1,xmm2 paddb xmm1, xmm2
paddb xmm3,xmm2 paddb xmm3, xmm2
paddb xmm5,xmm2 paddb xmm5, xmm2
paddb xmm7,xmm2 paddb xmm7, xmm2
movdqa xmm0,xmm1 ; transpose coefficients(phase 1) movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1) movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm1 ; transpose coefficients(phase 2) movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2,xmm6 ; transpose coefficients(phase 2) movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm3,xmm1 ; transpose coefficients(phase 3) movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7,xmm4 ; transpose coefficients(phase 3) movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -25,54 +25,54 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define CONST_BITS 8 ; 14 is also OK. %define CONST_BITS 8 ; 14 is also OK.
%define PASS1_BITS 2 %define PASS1_BITS 2
%if IFAST_SCALE_BITS != PASS1_BITS %if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif %endif
%if CONST_BITS == 8 %if CONST_BITS == 8
F_1_082 equ 277 ; FIX(1.082392200) F_1_082 equ 277 ; FIX(1.082392200)
F_1_414 equ 362 ; FIX(1.414213562) F_1_414 equ 362 ; FIX(1.414213562)
F_1_847 equ 473 ; FIX(1.847759065) F_1_847 equ 473 ; FIX(1.847759065)
F_2_613 equ 669 ; FIX(2.613125930) F_2_613 equ 669 ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 16 alignz 16
global EXTN(jconst_idct_ifast_sse2) global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2): EXTN(jconst_idct_ifast_sse2):
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Perform dequantization and inverse DCT on one block of coefficients. ; Perform dequantization and inverse DCT on one block of coefficients.
; ;
@@ -81,421 +81,421 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; JSAMPARRAY output_buf, JDIMENSION output_col) ; JSAMPARRAY output_buf, JDIMENSION output_col)
; ;
%define dct_table(b) (b)+8 ; jpeg_component_info *compptr %define dct_table(b) (b)+8 ; jpeg_component_info *compptr
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col %define output_col(b) (b)+20 ; JDIMENSION output_col
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_idct_ifast_sse2) global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2): EXTN(jsimd_idct_ifast_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
; mov eax, [original_ebp] ; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm0 por xmm1, xmm0
packsswb xmm1,xmm1 packsswb xmm1, xmm1
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end jmp near .column_end
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
; -- Even part ; -- Even part
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
psubw xmm0,xmm2 ; xmm0=tmp11 psubw xmm0, xmm2 ; xmm0=tmp11
psubw xmm1,xmm3 psubw xmm1, xmm3
paddw xmm4,xmm2 ; xmm4=tmp10 paddw xmm4, xmm2 ; xmm4=tmp10
paddw xmm5,xmm3 ; xmm5=tmp13 paddw xmm5, xmm3 ; xmm5=tmp13
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
psubw xmm1,xmm5 ; xmm1=tmp12 psubw xmm1, xmm5 ; xmm1=tmp12
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm7,xmm0 movdqa xmm7, xmm0
psubw xmm4,xmm5 ; xmm4=tmp3 psubw xmm4, xmm5 ; xmm4=tmp3
psubw xmm0,xmm1 ; xmm0=tmp2 psubw xmm0, xmm1 ; xmm0=tmp2
paddw xmm6,xmm5 ; xmm6=tmp0 paddw xmm6, xmm5 ; xmm6=tmp0
paddw xmm7,xmm1 ; xmm7=tmp1 paddw xmm7, xmm1 ; xmm7=tmp1
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
; -- Odd part ; -- Odd part
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm2,xmm1 ; xmm2=z12 psubw xmm2, xmm1 ; xmm2=z12
psubw xmm5,xmm3 ; xmm5=z10 psubw xmm5, xmm3 ; xmm5=z10
paddw xmm4,xmm1 ; xmm4=z11 paddw xmm4, xmm1 ; xmm4=z11
paddw xmm0,xmm3 ; xmm0=z13 paddw xmm0, xmm3 ; xmm0=z13
movdqa xmm1,xmm5 ; xmm1=z10(unscaled) movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
movdqa xmm3,xmm4 movdqa xmm3, xmm4
psubw xmm4,xmm0 psubw xmm4, xmm0
paddw xmm3,xmm0 ; xmm3=tmp7 paddw xmm3, xmm0 ; xmm3=tmp7
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
; (Original) ; (Original)
; tmp12 = -2.613125930 * z10 + z5; ; tmp12 = -2.613125930 * z10 + z5;
; ;
; (This implementation) ; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm0,xmm5 movdqa xmm0, xmm5
paddw xmm5,xmm2 paddw xmm5, xmm2
pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
psubw xmm0,xmm1 psubw xmm0, xmm1
psubw xmm2,xmm5 ; xmm2=tmp10 psubw xmm2, xmm5 ; xmm2=tmp10
paddw xmm0,xmm5 ; xmm0=tmp12 paddw xmm0, xmm5 ; xmm0=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm0,xmm3 ; xmm0=tmp6 psubw xmm0, xmm3 ; xmm0=tmp6
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4,xmm0 ; xmm4=tmp5 psubw xmm4, xmm0 ; xmm4=tmp5
movdqa xmm3,xmm6 ; transpose coefficients(phase 1) movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0,xmm5 ; transpose coefficients(phase 1) movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
paddw xmm2,xmm4 ; xmm2=tmp4 paddw xmm2, xmm4 ; xmm2=tmp4
movdqa xmm5,xmm7 movdqa xmm5, xmm7
movdqa xmm0,xmm1 movdqa xmm0, xmm1
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm7 ; transpose coefficients(phase 1) movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm1 ; transpose coefficients(phase 1) movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm0,xmm3 ; transpose coefficients(phase 2) movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2) movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa xmm3,xmm1 ; transpose coefficients(phase 2) movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 2) movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm4,xmm6 ; transpose coefficients(phase 3) movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7,xmm5 ; transpose coefficients(phase 3) movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa xmm4,xmm1 ; transpose coefficients(phase 3) movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7,xmm3 ; transpose coefficients(phase 3) movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov eax, [original_ebp] mov eax, [original_ebp]
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
; -- Even part ; -- Even part
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm6,xmm1 ; xmm6=tmp11 psubw xmm6, xmm1 ; xmm6=tmp11
psubw xmm5,xmm3 psubw xmm5, xmm3
paddw xmm2,xmm1 ; xmm2=tmp10 paddw xmm2, xmm1 ; xmm2=tmp10
paddw xmm0,xmm3 ; xmm0=tmp13 paddw xmm0, xmm3 ; xmm0=tmp13
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
psubw xmm5,xmm0 ; xmm5=tmp12 psubw xmm5, xmm0 ; xmm5=tmp12
movdqa xmm1,xmm2 movdqa xmm1, xmm2
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm2,xmm0 ; xmm2=tmp3 psubw xmm2, xmm0 ; xmm2=tmp3
psubw xmm6,xmm5 ; xmm6=tmp2 psubw xmm6, xmm5 ; xmm6=tmp2
paddw xmm1,xmm0 ; xmm1=tmp0 paddw xmm1, xmm0 ; xmm1=tmp0
paddw xmm3,xmm5 ; xmm3=tmp1 paddw xmm3, xmm5 ; xmm3=tmp1
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
; -- Odd part ; -- Odd part
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm6,xmm4 movdqa xmm6, xmm4
psubw xmm0,xmm7 ; xmm0=z12 psubw xmm0, xmm7 ; xmm0=z12
psubw xmm4,xmm5 ; xmm4=z10 psubw xmm4, xmm5 ; xmm4=z10
paddw xmm2,xmm7 ; xmm2=z11 paddw xmm2, xmm7 ; xmm2=z11
paddw xmm6,xmm5 ; xmm6=z13 paddw xmm6, xmm5 ; xmm6=z13
movdqa xmm7,xmm4 ; xmm7=z10(unscaled) movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
movdqa xmm5,xmm2 movdqa xmm5, xmm2
psubw xmm2,xmm6 psubw xmm2, xmm6
paddw xmm5,xmm6 ; xmm5=tmp7 paddw xmm5, xmm6 ; xmm5=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
; (Original) ; (Original)
; tmp12 = -2.613125930 * z10 + z5; ; tmp12 = -2.613125930 * z10 + z5;
; ;
; (This implementation) ; (This implementation)
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm6,xmm4 movdqa xmm6, xmm4
paddw xmm4,xmm0 paddw xmm4, xmm0
pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
psubw xmm6,xmm7 psubw xmm6, xmm7
psubw xmm0,xmm4 ; xmm0=tmp10 psubw xmm0, xmm4 ; xmm0=tmp10
paddw xmm6,xmm4 ; xmm6=tmp12 paddw xmm6, xmm4 ; xmm6=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm6,xmm5 ; xmm6=tmp6 psubw xmm6, xmm5 ; xmm6=tmp6
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm3 movdqa xmm4, xmm3
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1,(PASS1_BITS+3) ; descale psraw xmm1, (PASS1_BITS+3) ; descale
psraw xmm3,(PASS1_BITS+3) ; descale psraw xmm3, (PASS1_BITS+3) ; descale
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psubw xmm2,xmm6 ; xmm2=tmp5 psubw xmm2, xmm6 ; xmm2=tmp5
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
paddw xmm0,xmm2 ; xmm0=tmp4 paddw xmm0, xmm2 ; xmm0=tmp4
movdqa xmm4,xmm5 movdqa xmm4, xmm5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5,(PASS1_BITS+3) ; descale psraw xmm5, (PASS1_BITS+3) ; descale
psraw xmm6,(PASS1_BITS+3) ; descale psraw xmm6, (PASS1_BITS+3) ; descale
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm1,xmm2 paddb xmm1, xmm2
paddb xmm3,xmm2 paddb xmm3, xmm2
paddb xmm5,xmm2 paddb xmm5, xmm2
paddb xmm7,xmm2 paddb xmm7, xmm2
movdqa xmm0,xmm1 ; transpose coefficients(phase 1) movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1) movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm1 ; transpose coefficients(phase 2) movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2,xmm6 ; transpose coefficients(phase 2) movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm3,xmm1 ; transpose coefficients(phase 3) movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7,xmm4 ; transpose coefficients(phase 3) movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -26,74 +26,74 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define CONST_BITS 13 %define CONST_BITS 13
%define PASS1_BITS 2 %define PASS1_BITS 2
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) %define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) %define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) %define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
%if CONST_BITS == 13 %if CONST_BITS == 13
F_0_211 equ 1730 ; FIX(0.211164243) F_0_211 equ 1730 ; FIX(0.211164243)
F_0_509 equ 4176 ; FIX(0.509795579) F_0_509 equ 4176 ; FIX(0.509795579)
F_0_601 equ 4926 ; FIX(0.601344887) F_0_601 equ 4926 ; FIX(0.601344887)
F_0_720 equ 5906 ; FIX(0.720959822) F_0_720 equ 5906 ; FIX(0.720959822)
F_0_765 equ 6270 ; FIX(0.765366865) F_0_765 equ 6270 ; FIX(0.765366865)
F_0_850 equ 6967 ; FIX(0.850430095) F_0_850 equ 6967 ; FIX(0.850430095)
F_0_899 equ 7373 ; FIX(0.899976223) F_0_899 equ 7373 ; FIX(0.899976223)
F_1_061 equ 8697 ; FIX(1.061594337) F_1_061 equ 8697 ; FIX(1.061594337)
F_1_272 equ 10426 ; FIX(1.272758580) F_1_272 equ 10426 ; FIX(1.272758580)
F_1_451 equ 11893 ; FIX(1.451774981) F_1_451 equ 11893 ; FIX(1.451774981)
F_1_847 equ 15137 ; FIX(1.847759065) F_1_847 equ 15137 ; FIX(1.847759065)
F_2_172 equ 17799 ; FIX(2.172734803) F_2_172 equ 17799 ; FIX(2.172734803)
F_2_562 equ 20995 ; FIX(2.562915447) F_2_562 equ 20995 ; FIX(2.562915447)
F_3_624 equ 29692 ; FIX(3.624509785) F_3_624 equ 29692 ; FIX(3.624509785)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_idct_red_sse2) global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2): EXTN(jconst_idct_red_sse2):
PW_F184_MF076 times 4 dw F_1_847,-F_0_765 PW_F184_MF076 times 4 dw F_1_847,-F_0_765
PW_F256_F089 times 4 dw F_2_562, F_0_899 PW_F256_F089 times 4 dw F_2_562, F_0_899
PW_F106_MF217 times 4 dw F_1_061,-F_2_172 PW_F106_MF217 times 4 dw F_1_061,-F_2_172
PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
PW_F145_MF021 times 4 dw F_1_451,-F_0_211 PW_F145_MF021 times 4 dw F_1_451,-F_0_211
PW_F362_MF127 times 4 dw F_3_624,-F_1_272 PW_F362_MF127 times 4 dw F_3_624,-F_1_272
PW_F085_MF072 times 4 dw F_0_850,-F_0_720 PW_F085_MF072 times 4 dw F_0_850,-F_0_720
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Perform dequantization and inverse DCT on one block of coefficients, ; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block. ; producing a reduced-size 4x4 output block.
@@ -108,292 +108,292 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13 = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_idct_4x4_sse2) global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2): EXTN(jsimd_idct_4x4_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
mov rdx, r10 ; quantptr mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz short .columnDCT jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm0,xmm1 por xmm0, xmm1
packsswb xmm0,xmm0 packsswb xmm0, xmm0
packsswb xmm0,xmm0 packsswb xmm0, xmm0
movd eax,xmm0 movd eax, xmm0
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm0,PASS1_BITS psllw xmm0, PASS1_BITS
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
jmp near .column_end jmp near .column_end
%endif %endif
.columnDCT: .columnDCT:
; -- Odd part ; -- Odd part
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm0 movdqa xmm5, xmm0
punpcklwd xmm4,xmm1 punpcklwd xmm4, xmm1
punpckhwd xmm5,xmm1 punpckhwd xmm5, xmm1
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm6,xmm3 punpcklwd xmm6, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
paddd xmm6,xmm4 ; xmm6=tmp2L paddd xmm6, xmm4 ; xmm6=tmp2L
paddd xmm7,xmm5 ; xmm7=tmp2H paddd xmm7, xmm5 ; xmm7=tmp2H
paddd xmm2,xmm0 ; xmm2=tmp0L paddd xmm2, xmm0 ; xmm2=tmp0L
paddd xmm3,xmm1 ; xmm3=tmp0H paddd xmm3, xmm1 ; xmm3=tmp0H
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
; -- Even part ; -- Even part
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm2,xmm2 pxor xmm2, xmm2
punpcklwd xmm1,xmm4 ; xmm1=tmp0L punpcklwd xmm1, xmm4 ; xmm1=tmp0L
punpckhwd xmm2,xmm4 ; xmm2=tmp0H punpckhwd xmm2, xmm4 ; xmm2=tmp0H
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
movdqa xmm3,xmm5 ; xmm5=in2=z2 movdqa xmm3, xmm5 ; xmm5=in2=z2
punpcklwd xmm5,xmm0 ; xmm0=in6=z3 punpcklwd xmm5, xmm0 ; xmm0=in6=z3
punpckhwd xmm3,xmm0 punpckhwd xmm3, xmm0
pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
movdqa xmm4,xmm1 movdqa xmm4, xmm1
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddd xmm1,xmm5 ; xmm1=tmp10L paddd xmm1, xmm5 ; xmm1=tmp10L
paddd xmm2,xmm3 ; xmm2=tmp10H paddd xmm2, xmm3 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp12L psubd xmm4, xmm5 ; xmm4=tmp12L
psubd xmm0,xmm3 ; xmm0=tmp12H psubd xmm0, xmm3 ; xmm0=tmp12H
; -- Final output stage ; -- Final output stage
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm3,xmm2 movdqa xmm3, xmm2
paddd xmm1,xmm6 ; xmm1=data0L paddd xmm1, xmm6 ; xmm1=data0L
paddd xmm2,xmm7 ; xmm2=data0H paddd xmm2, xmm7 ; xmm2=data0H
psubd xmm5,xmm6 ; xmm5=data3L psubd xmm5, xmm6 ; xmm5=data3L
psubd xmm3,xmm7 ; xmm3=data3H psubd xmm3, xmm7 ; xmm3=data3H
movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
paddd xmm1,xmm6 paddd xmm1, xmm6
paddd xmm2,xmm6 paddd xmm2, xmm6
psrad xmm1,DESCALE_P1_4 psrad xmm1, DESCALE_P1_4
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
paddd xmm5,xmm6 paddd xmm5, xmm6
paddd xmm3,xmm6 paddd xmm3, xmm6
psrad xmm5,DESCALE_P1_4 psrad xmm5, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm2,xmm4 movdqa xmm2, xmm4
movdqa xmm3,xmm0 movdqa xmm3, xmm0
paddd xmm4,xmm7 ; xmm4=data1L paddd xmm4, xmm7 ; xmm4=data1L
paddd xmm0,xmm6 ; xmm0=data1H paddd xmm0, xmm6 ; xmm0=data1H
psubd xmm2,xmm7 ; xmm2=data2L psubd xmm2, xmm7 ; xmm2=data2L
psubd xmm3,xmm6 ; xmm3=data2H psubd xmm3, xmm6 ; xmm3=data2H
movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
paddd xmm4,xmm7 paddd xmm4, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm4,DESCALE_P1_4 psrad xmm4, DESCALE_P1_4
psrad xmm0,DESCALE_P1_4 psrad xmm0, DESCALE_P1_4
paddd xmm2,xmm7 paddd xmm2, xmm7
paddd xmm3,xmm7 paddd xmm3, xmm7
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
movdqa xmm6,xmm1 ; transpose coefficients(phase 1) movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7,xmm2 ; transpose coefficients(phase 1) movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2) movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3,xmm6 ; transpose coefficients(phase 2) movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array. ; ---- Pass 2: process rows, store into output array.
mov rax, [original_rbp] mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
; -- Even part ; -- Even part
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm4,xmm1 ; xmm4=tmp0 punpcklwd xmm4, xmm1 ; xmm4=tmp0
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
; -- Odd part ; -- Odd part
punpckhwd xmm1,xmm0 punpckhwd xmm1, xmm0
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm2,xmm6 movdqa xmm2, xmm6
pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
paddd xmm6,xmm1 ; xmm6=tmp2 paddd xmm6, xmm1 ; xmm6=tmp2
paddd xmm2,xmm5 ; xmm2=tmp0 paddd xmm2, xmm5 ; xmm2=tmp0
; -- Even part ; -- Even part
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm0 ; xmm4=tmp10 paddd xmm4, xmm0 ; xmm4=tmp10
psubd xmm7,xmm0 ; xmm7=tmp12 psubd xmm7, xmm0 ; xmm7=tmp12
; -- Final output stage ; -- Final output stage
movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
movdqa xmm5,xmm4 movdqa xmm5, xmm4
movdqa xmm3,xmm7 movdqa xmm3, xmm7
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
paddd xmm4,xmm1 paddd xmm4, xmm1
paddd xmm7,xmm1 paddd xmm7, xmm1
psrad xmm4,DESCALE_P2_4 psrad xmm4, DESCALE_P2_4
psrad xmm7,DESCALE_P2_4 psrad xmm7, DESCALE_P2_4
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm3,xmm1 paddd xmm3, xmm1
psrad xmm5,DESCALE_P2_4 psrad xmm5, DESCALE_P2_4
psrad xmm3,DESCALE_P2_4 psrad xmm3, DESCALE_P2_4
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
movdqa xmm0,xmm4 ; transpose coefficients(phase 1) movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm6,xmm4 ; transpose coefficients(phase 2) movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4,[rel PB_CENTERJSAMP] paddb xmm4, [rel PB_CENTERJSAMP]
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -411,165 +411,165 @@ EXTN(jsimd_idct_4x4_sse2):
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13 = JDIMENSION output_col
align 16 align 16
global EXTN(jsimd_idct_2x2_sse2) global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2): EXTN(jsimd_idct_2x2_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
mov rdx, r10 ; quantptr mov rdx, r10 ; quantptr
mov rsi, r11 ; inptr mov rsi, r11 ; inptr
; | input: | result: | ; | input: | result: |
; | 00 01 ** 03 ** 05 ** 07 | | ; | 00 01 ** 03 ** 05 ** 07 | |
; | 10 11 ** 13 ** 15 ** 17 | | ; | 10 11 ** 13 ** 15 ** 17 | |
; | ** ** ** ** ** ** ** ** | | ; | ** ** ** ** ** ** ** ** | |
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
; | 50 51 ** 53 ** 55 ** 57 | | ; | 50 51 ** 53 ** 55 ** 57 | |
; | ** ** ** ** ** ** ** ** | | ; | ** ** ** ** ** ** ** ** | |
; | 70 71 ** 73 ** 75 ** 77 | | ; | 70 71 ** 73 ** 75 ** 77 | |
; -- Odd part ; -- Odd part
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
pcmpeqd xmm7,xmm7 pcmpeqd xmm7, xmm7
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4,[rel PW_F362_MF127] pmaddwd xmm4, [rel PW_F362_MF127]
pmaddwd xmm5,[rel PW_F085_MF072] pmaddwd xmm5, [rel PW_F085_MF072]
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0,[rel PW_F362_MF127] pmaddwd xmm0, [rel PW_F362_MF127]
pmaddwd xmm2,[rel PW_F085_MF072] pmaddwd xmm2, [rel PW_F085_MF072]
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
; -- Even part ; -- Even part
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm6=(00 01 ** 03 ** 05 ** 07) ; xmm6=(00 01 ** 03 ** 05 ** 07)
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
; -- Final output stage ; -- Final output stage
movdqa xmm3,xmm6 movdqa xmm3, xmm6
movdqa xmm5,xmm1 movdqa xmm5, xmm1
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
movdqa xmm7,xmm1 movdqa xmm7, xmm1
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
paddd xmm6,xmm2 paddd xmm6, xmm2
psrad xmm6,DESCALE_P1_2 psrad xmm6, DESCALE_P1_2
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm7,xmm2 paddd xmm7, xmm2
psrad xmm1,DESCALE_P1_2 psrad xmm1, DESCALE_P1_2
psrad xmm7,DESCALE_P1_2 psrad xmm7, DESCALE_P1_2
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array. ; ---- Pass 2: process rows, store into output array.
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
; | input:| result:| ; | input:| result:|
; | A0 B0 | | ; | A0 B0 | |
; | A1 B1 | C0 C1 | ; | A1 B1 | C0 C1 |
; | A3 B3 | D0 D1 | ; | A3 B3 | D0 D1 |
; | A5 B5 | | ; | A5 B5 | |
; | A7 B7 | | ; | A7 B7 | |
; -- Odd part ; -- Odd part
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1,[rel PW_F362_MF127] pmaddwd xmm1, [rel PW_F362_MF127]
pmaddwd xmm7,[rel PW_F085_MF072] pmaddwd xmm7, [rel PW_F085_MF072]
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
; -- Even part ; -- Even part
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
; -- Final output stage ; -- Final output stage
movdqa xmm4,xmm6 movdqa xmm4, xmm6
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
paddd xmm6,[rel PD_DESCALE_P2_2] paddd xmm6, [rel PD_DESCALE_P2_2]
psrad xmm6,DESCALE_P2_2 psrad xmm6, DESCALE_P2_2
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6,[rel PB_CENTERJSAMP] paddb xmm6, [rel PB_CENTERJSAMP]
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
pop rbx pop rbx
uncollect_args uncollect_args
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -25,74 +25,74 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%define CONST_BITS 13 %define CONST_BITS 13
%define PASS1_BITS 2 %define PASS1_BITS 2
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) %define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) %define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) %define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
%if CONST_BITS == 13 %if CONST_BITS == 13
F_0_211 equ 1730 ; FIX(0.211164243) F_0_211 equ 1730 ; FIX(0.211164243)
F_0_509 equ 4176 ; FIX(0.509795579) F_0_509 equ 4176 ; FIX(0.509795579)
F_0_601 equ 4926 ; FIX(0.601344887) F_0_601 equ 4926 ; FIX(0.601344887)
F_0_720 equ 5906 ; FIX(0.720959822) F_0_720 equ 5906 ; FIX(0.720959822)
F_0_765 equ 6270 ; FIX(0.765366865) F_0_765 equ 6270 ; FIX(0.765366865)
F_0_850 equ 6967 ; FIX(0.850430095) F_0_850 equ 6967 ; FIX(0.850430095)
F_0_899 equ 7373 ; FIX(0.899976223) F_0_899 equ 7373 ; FIX(0.899976223)
F_1_061 equ 8697 ; FIX(1.061594337) F_1_061 equ 8697 ; FIX(1.061594337)
F_1_272 equ 10426 ; FIX(1.272758580) F_1_272 equ 10426 ; FIX(1.272758580)
F_1_451 equ 11893 ; FIX(1.451774981) F_1_451 equ 11893 ; FIX(1.451774981)
F_1_847 equ 15137 ; FIX(1.847759065) F_1_847 equ 15137 ; FIX(1.847759065)
F_2_172 equ 17799 ; FIX(2.172734803) F_2_172 equ 17799 ; FIX(2.172734803)
F_2_562 equ 20995 ; FIX(2.562915447) F_2_562 equ 20995 ; FIX(2.562915447)
F_3_624 equ 29692 ; FIX(3.624509785) F_3_624 equ 29692 ; FIX(3.624509785)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 16 alignz 16
global EXTN(jconst_idct_red_sse2) global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2): EXTN(jconst_idct_red_sse2):
PW_F184_MF076 times 4 dw F_1_847,-F_0_765 PW_F184_MF076 times 4 dw F_1_847,-F_0_765
PW_F256_F089 times 4 dw F_2_562, F_0_899 PW_F256_F089 times 4 dw F_2_562, F_0_899
PW_F106_MF217 times 4 dw F_1_061,-F_2_172 PW_F106_MF217 times 4 dw F_1_061,-F_2_172
PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
PW_F145_MF021 times 4 dw F_1_451,-F_0_211 PW_F145_MF021 times 4 dw F_1_451,-F_0_211
PW_F362_MF127 times 4 dw F_3_624,-F_1_272 PW_F362_MF127 times 4 dw F_3_624,-F_1_272
PW_F085_MF072 times 4 dw F_0_850,-F_0_720 PW_F085_MF072 times 4 dw F_0_850,-F_0_720
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 16 alignz 16
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Perform dequantization and inverse DCT on one block of coefficients, ; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block. ; producing a reduced-size 4x4 output block.
@@ -102,309 +102,309 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; JSAMPARRAY output_buf, JDIMENSION output_col) ; JSAMPARRAY output_buf, JDIMENSION output_col)
; ;
%define dct_table(b) (b)+8 ; void *dct_table %define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col %define output_col(b) (b)+20 ; JDIMENSION output_col
%define original_ebp ebp+0 %define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 16 align 16
global EXTN(jsimd_idct_4x4_sse2) global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2): EXTN(jsimd_idct_4x4_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
; mov eax, [original_ebp] ; mov eax, [original_ebp]
mov edx, POINTER [dct_table(eax)] ; quantptr mov edx, POINTER [dct_table(eax)] ; quantptr
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm0,xmm1 por xmm0, xmm1
packsswb xmm0,xmm0 packsswb xmm0, xmm0
packsswb xmm0,xmm0 packsswb xmm0, xmm0
movd eax,xmm0 movd eax, xmm0
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm0,PASS1_BITS psllw xmm0, PASS1_BITS
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
jmp near .column_end jmp near .column_end
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
; -- Odd part ; -- Odd part
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm0 movdqa xmm5, xmm0
punpcklwd xmm4,xmm1 punpcklwd xmm4, xmm1
punpckhwd xmm5,xmm1 punpckhwd xmm5, xmm1
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm6,xmm3 punpcklwd xmm6, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
paddd xmm6,xmm4 ; xmm6=tmp2L paddd xmm6, xmm4 ; xmm6=tmp2L
paddd xmm7,xmm5 ; xmm7=tmp2H paddd xmm7, xmm5 ; xmm7=tmp2H
paddd xmm2,xmm0 ; xmm2=tmp0L paddd xmm2, xmm0 ; xmm2=tmp0L
paddd xmm3,xmm1 ; xmm3=tmp0H paddd xmm3, xmm1 ; xmm3=tmp0H
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
; -- Even part ; -- Even part
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm2,xmm2 pxor xmm2, xmm2
punpcklwd xmm1,xmm4 ; xmm1=tmp0L punpcklwd xmm1, xmm4 ; xmm1=tmp0L
punpckhwd xmm2,xmm4 ; xmm2=tmp0H punpckhwd xmm2, xmm4 ; xmm2=tmp0H
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
movdqa xmm3,xmm5 ; xmm5=in2=z2 movdqa xmm3, xmm5 ; xmm5=in2=z2
punpcklwd xmm5,xmm0 ; xmm0=in6=z3 punpcklwd xmm5, xmm0 ; xmm0=in6=z3
punpckhwd xmm3,xmm0 punpckhwd xmm3, xmm0
pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
movdqa xmm4,xmm1 movdqa xmm4, xmm1
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddd xmm1,xmm5 ; xmm1=tmp10L paddd xmm1, xmm5 ; xmm1=tmp10L
paddd xmm2,xmm3 ; xmm2=tmp10H paddd xmm2, xmm3 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp12L psubd xmm4, xmm5 ; xmm4=tmp12L
psubd xmm0,xmm3 ; xmm0=tmp12H psubd xmm0, xmm3 ; xmm0=tmp12H
; -- Final output stage ; -- Final output stage
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm3,xmm2 movdqa xmm3, xmm2
paddd xmm1,xmm6 ; xmm1=data0L paddd xmm1, xmm6 ; xmm1=data0L
paddd xmm2,xmm7 ; xmm2=data0H paddd xmm2, xmm7 ; xmm2=data0H
psubd xmm5,xmm6 ; xmm5=data3L psubd xmm5, xmm6 ; xmm5=data3L
psubd xmm3,xmm7 ; xmm3=data3H psubd xmm3, xmm7 ; xmm3=data3H
movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
paddd xmm1,xmm6 paddd xmm1, xmm6
paddd xmm2,xmm6 paddd xmm2, xmm6
psrad xmm1,DESCALE_P1_4 psrad xmm1, DESCALE_P1_4
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
paddd xmm5,xmm6 paddd xmm5, xmm6
paddd xmm3,xmm6 paddd xmm3, xmm6
psrad xmm5,DESCALE_P1_4 psrad xmm5, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm2,xmm4 movdqa xmm2, xmm4
movdqa xmm3,xmm0 movdqa xmm3, xmm0
paddd xmm4,xmm7 ; xmm4=data1L paddd xmm4, xmm7 ; xmm4=data1L
paddd xmm0,xmm6 ; xmm0=data1H paddd xmm0, xmm6 ; xmm0=data1H
psubd xmm2,xmm7 ; xmm2=data2L psubd xmm2, xmm7 ; xmm2=data2L
psubd xmm3,xmm6 ; xmm3=data2H psubd xmm3, xmm6 ; xmm3=data2H
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
paddd xmm4,xmm7 paddd xmm4, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm4,DESCALE_P1_4 psrad xmm4, DESCALE_P1_4
psrad xmm0,DESCALE_P1_4 psrad xmm0, DESCALE_P1_4
paddd xmm2,xmm7 paddd xmm2, xmm7
paddd xmm3,xmm7 paddd xmm3, xmm7
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
movdqa xmm6,xmm1 ; transpose coefficients(phase 1) movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7,xmm2 ; transpose coefficients(phase 1) movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2) movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3,xmm6 ; transpose coefficients(phase 2) movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array. ; ---- Pass 2: process rows, store into output array.
mov eax, [original_ebp] mov eax, [original_ebp]
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
; -- Even part ; -- Even part
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm4,xmm1 ; xmm4=tmp0 punpcklwd xmm4, xmm1 ; xmm4=tmp0
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
; -- Odd part ; -- Odd part
punpckhwd xmm1,xmm0 punpckhwd xmm1, xmm0
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm2,xmm6 movdqa xmm2, xmm6
pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
paddd xmm6,xmm1 ; xmm6=tmp2 paddd xmm6, xmm1 ; xmm6=tmp2
paddd xmm2,xmm5 ; xmm2=tmp0 paddd xmm2, xmm5 ; xmm2=tmp0
; -- Even part ; -- Even part
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm0 ; xmm4=tmp10 paddd xmm4, xmm0 ; xmm4=tmp10
psubd xmm7,xmm0 ; xmm7=tmp12 psubd xmm7, xmm0 ; xmm7=tmp12
; -- Final output stage ; -- Final output stage
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
movdqa xmm5,xmm4 movdqa xmm5, xmm4
movdqa xmm3,xmm7 movdqa xmm3, xmm7
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
paddd xmm4,xmm1 paddd xmm4, xmm1
paddd xmm7,xmm1 paddd xmm7, xmm1
psrad xmm4,DESCALE_P2_4 psrad xmm4, DESCALE_P2_4
psrad xmm7,DESCALE_P2_4 psrad xmm7, DESCALE_P2_4
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm3,xmm1 paddd xmm3, xmm1
psrad xmm5,DESCALE_P2_4 psrad xmm5, DESCALE_P2_4
psrad xmm3,DESCALE_P2_4 psrad xmm3, DESCALE_P2_4
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
movdqa xmm0,xmm4 ; transpose coefficients(phase 1) movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm6,xmm4 ; transpose coefficients(phase 2) movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -417,177 +417,177 @@ EXTN(jsimd_idct_4x4_sse2):
; JSAMPARRAY output_buf, JDIMENSION output_col) ; JSAMPARRAY output_buf, JDIMENSION output_col)
; ;
%define dct_table(b) (b)+8 ; void *dct_table %define dct_table(b) (b)+8 ; void *dct_table
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col %define output_col(b) (b)+20 ; JDIMENSION output_col
align 16 align 16
global EXTN(jsimd_idct_2x2_sse2) global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2): EXTN(jsimd_idct_2x2_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
mov edx, POINTER [dct_table(ebp)] ; quantptr mov edx, POINTER [dct_table(ebp)] ; quantptr
mov esi, JCOEFPTR [coef_block(ebp)] ; inptr mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
; | input: | result: | ; | input: | result: |
; | 00 01 ** 03 ** 05 ** 07 | | ; | 00 01 ** 03 ** 05 ** 07 | |
; | 10 11 ** 13 ** 15 ** 17 | | ; | 10 11 ** 13 ** 15 ** 17 | |
; | ** ** ** ** ** ** ** ** | | ; | ** ** ** ** ** ** ** ** | |
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
; | 50 51 ** 53 ** 55 ** 57 | | ; | 50 51 ** 53 ** 55 ** 57 | |
; | ** ** ** ** ** ** ** ** | | ; | ** ** ** ** ** ** ** ** | |
; | 70 71 ** 73 ** 75 ** 77 | | ; | 70 71 ** 73 ** 75 ** 77 | |
; -- Odd part ; -- Odd part
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
pcmpeqd xmm7,xmm7 pcmpeqd xmm7, xmm7
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
; -- Even part ; -- Even part
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
; xmm6=(00 01 ** 03 ** 05 ** 07) ; xmm6=(00 01 ** 03 ** 05 ** 07)
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
; -- Final output stage ; -- Final output stage
movdqa xmm3,xmm6 movdqa xmm3, xmm6
movdqa xmm5,xmm1 movdqa xmm5, xmm1
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
movdqa xmm7,xmm1 movdqa xmm7, xmm1
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
paddd xmm6,xmm2 paddd xmm6, xmm2
psrad xmm6,DESCALE_P1_2 psrad xmm6, DESCALE_P1_2
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm7,xmm2 paddd xmm7, xmm2
psrad xmm1,DESCALE_P1_2 psrad xmm1, DESCALE_P1_2
psrad xmm7,DESCALE_P1_2 psrad xmm7, DESCALE_P1_2
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
; ---- Pass 2: process rows, store into output array. ; ---- Pass 2: process rows, store into output array.
mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(ebp)] mov eax, JDIMENSION [output_col(ebp)]
; | input:| result:| ; | input:| result:|
; | A0 B0 | | ; | A0 B0 | |
; | A1 B1 | C0 C1 | ; | A1 B1 | C0 C1 |
; | A3 B3 | D0 D1 | ; | A3 B3 | D0 D1 |
; | A5 B5 | | ; | A5 B5 | |
; | A7 B7 | | ; | A7 B7 | |
; -- Odd part ; -- Odd part
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
; -- Even part ; -- Even part
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
; -- Final output stage ; -- Final output stage
movdqa xmm4,xmm6 movdqa xmm4, xmm6
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
psrad xmm6,DESCALE_P2_2 psrad xmm6, DESCALE_P2_2
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov WORD [edx+eax*SIZEOF_JSAMPLE], bx mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
mov WORD [esi+eax*SIZEOF_JSAMPLE], cx mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -20,8 +20,8 @@
%include "jdct.inc" %include "jdct.inc"
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Load data into workspace, applying unsigned->signed conversion ; Load data into workspace, applying unsigned->signed conversion
; ;
@@ -34,65 +34,65 @@
; r11 = JDIMENSION start_col ; r11 = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace ; r12 = FAST_FLOAT *workspace
align 16 align 16
global EXTN(jsimd_convsamp_float_sse2) global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2): EXTN(jsimd_convsamp_float_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 psllw xmm7, 7
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
mov rsi, r10 mov rsi, r10
mov eax, r11d mov eax, r11d
mov rdi, r12 mov rdi, r12
mov rcx, DCTSIZE/2 mov rcx, DCTSIZE/2
.convloop: .convloop:
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
psubb xmm0,xmm7 ; xmm0=(01234567) psubb xmm0, xmm7 ; xmm0=(01234567)
psubb xmm1,xmm7 ; xmm1=(89ABCDEF) psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2,xmm2 ; xmm2=(0123) cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
cvtdq2ps xmm0,xmm0 ; xmm0=(4567) cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
add rsi, byte 2*SIZEOF_JSAMPROW add rsi, byte 2*SIZEOF_JSAMPROW
add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec rcx dec rcx
jnz short .convloop jnz short .convloop
pop rbx pop rbx
uncollect_args uncollect_args
pop rbp pop rbp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -108,50 +108,50 @@ EXTN(jsimd_convsamp_float_sse2):
; r11 = FAST_FLOAT *divisors ; r11 = FAST_FLOAT *divisors
; r12 = FAST_FLOAT *workspace ; r12 = FAST_FLOAT *workspace
align 16 align 16
global EXTN(jsimd_quantize_float_sse2) global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2): EXTN(jsimd_quantize_float_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov rsi, r12 mov rsi, r12
mov rdx, r11 mov rdx, r11
mov rdi, r10 mov rdi, r10
mov rax, DCTSIZE2/16 mov rax, DCTSIZE2/16
.quantloop: .quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
cvtps2dq xmm0,xmm0 cvtps2dq xmm0, xmm0
cvtps2dq xmm1,xmm1 cvtps2dq xmm1, xmm1
cvtps2dq xmm2,xmm2 cvtps2dq xmm2, xmm2
cvtps2dq xmm3,xmm3 cvtps2dq xmm3, xmm3
packssdw xmm0,xmm1 packssdw xmm0, xmm1
packssdw xmm2,xmm3 packssdw xmm2, xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
add rsi, byte 16*SIZEOF_FAST_FLOAT add rsi, byte 16*SIZEOF_FAST_FLOAT
add rdx, byte 16*SIZEOF_FAST_FLOAT add rdx, byte 16*SIZEOF_FAST_FLOAT
add rdi, byte 16*SIZEOF_JCOEF add rdi, byte 16*SIZEOF_JCOEF
dec rax dec rax
jnz short .quantloop jnz short .quantloop
uncollect_args uncollect_args
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -19,8 +19,8 @@
%include "jdct.inc" %include "jdct.inc"
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Load data into workspace, applying unsigned->signed conversion ; Load data into workspace, applying unsigned->signed conversion
; ;
@@ -29,75 +29,75 @@
; FAST_FLOAT *workspace); ; FAST_FLOAT *workspace);
; ;
%define sample_data ebp+8 ; JSAMPARRAY sample_data %define sample_data ebp+8 ; JSAMPARRAY sample_data
%define start_col ebp+12 ; JDIMENSION start_col %define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; FAST_FLOAT *workspace %define workspace ebp+16 ; FAST_FLOAT *workspace
align 16 align 16
global EXTN(jsimd_convsamp_float_sse2) global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2): EXTN(jsimd_convsamp_float_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 psllw xmm7, 7
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16,7 alignx 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
psubb xmm0,xmm7 ; xmm0=(01234567) psubb xmm0, xmm7 ; xmm0=(01234567)
psubb xmm1,xmm7 ; xmm1=(89ABCDEF) psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2,xmm2 ; xmm2=(0123) cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
cvtdq2ps xmm0,xmm0 ; xmm0=(4567) cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
add esi, byte 2*SIZEOF_JSAMPROW add esi, byte 2*SIZEOF_JSAMPROW
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx dec ecx
jnz short .convloop jnz short .convloop
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
pop ebp pop ebp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -109,62 +109,62 @@ EXTN(jsimd_convsamp_float_sse2):
; FAST_FLOAT *workspace); ; FAST_FLOAT *workspace);
; ;
%define coef_block ebp+8 ; JCOEFPTR coef_block %define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; FAST_FLOAT *divisors %define divisors ebp+12 ; FAST_FLOAT *divisors
%define workspace ebp+16 ; FAST_FLOAT *workspace %define workspace ebp+16 ; FAST_FLOAT *workspace
align 16 align 16
global EXTN(jsimd_quantize_float_sse2) global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2): EXTN(jsimd_quantize_float_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
mov esi, POINTER [workspace] mov esi, POINTER [workspace]
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16 mov eax, DCTSIZE2/16
alignx 16,7 alignx 16, 7
.quantloop: .quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
cvtps2dq xmm0,xmm0 cvtps2dq xmm0, xmm0
cvtps2dq xmm1,xmm1 cvtps2dq xmm1, xmm1
cvtps2dq xmm2,xmm2 cvtps2dq xmm2, xmm2
cvtps2dq xmm3,xmm3 cvtps2dq xmm3, xmm3
packssdw xmm0,xmm1 packssdw xmm0, xmm1
packssdw xmm2,xmm3 packssdw xmm2, xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
add esi, byte 16*SIZEOF_FAST_FLOAT add esi, byte 16*SIZEOF_FAST_FLOAT
add edx, byte 16*SIZEOF_FAST_FLOAT add edx, byte 16*SIZEOF_FAST_FLOAT
add edi, byte 16*SIZEOF_JCOEF add edi, byte 16*SIZEOF_JCOEF
dec eax dec eax
jnz short .quantloop jnz short .quantloop
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
; pop ebx ; unused ; pop ebx ; unused
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -20,8 +20,8 @@
%include "jdct.inc" %include "jdct.inc"
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 64 BITS 64
; ;
; Load data into workspace, applying unsigned->signed conversion ; Load data into workspace, applying unsigned->signed conversion
; ;
@@ -34,60 +34,60 @@
; r11 = JDIMENSION start_col ; r11 = JDIMENSION start_col
; r12 = DCTELEM *workspace ; r12 = DCTELEM *workspace
align 16 align 16
global EXTN(jsimd_convsamp_sse2) global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2): EXTN(jsimd_convsamp_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
pxor xmm6,xmm6 ; xmm6=(all 0's) pxor xmm6, xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
mov rsi, r10 mov rsi, r10
mov eax, r11d mov eax, r11d
mov rdi, r12 mov rdi, r12
mov rcx, DCTSIZE/4 mov rcx, DCTSIZE/4
.convloop: .convloop:
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
punpcklbw xmm0,xmm6 ; xmm0=(01234567) punpcklbw xmm0, xmm6 ; xmm0=(01234567)
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2,xmm7 paddw xmm2, xmm7
paddw xmm3,xmm7 paddw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
add rsi, byte 4*SIZEOF_JSAMPROW add rsi, byte 4*SIZEOF_JSAMPROW
add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec rcx dec rcx
jnz short .convloop jnz short .convloop
pop rbx pop rbx
uncollect_args uncollect_args
pop rbp pop rbp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; ;
@@ -102,85 +102,85 @@ EXTN(jsimd_convsamp_sse2):
; DCTELEM *workspace); ; DCTELEM *workspace);
; ;
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) %define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
; r10 = JCOEFPTR coef_block ; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors ; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace ; r12 = DCTELEM *workspace
align 16 align 16
global EXTN(jsimd_quantize_sse2) global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2): EXTN(jsimd_quantize_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov rsi, r12 mov rsi, r12
mov rdx, r11 mov rdx, r11
mov rdi, r10 mov rdi, r10
mov rax, DCTSIZE2/32 mov rax, DCTSIZE2/32
.quantloop: .quantloop:
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
psraw xmm4,(WORD_BIT-1) psraw xmm4, (WORD_BIT-1)
psraw xmm5,(WORD_BIT-1) psraw xmm5, (WORD_BIT-1)
psraw xmm6,(WORD_BIT-1) psraw xmm6, (WORD_BIT-1)
psraw xmm7,(WORD_BIT-1) psraw xmm7, (WORD_BIT-1)
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 psubw xmm0, xmm4
psubw xmm1,xmm5 psubw xmm1, xmm5
psubw xmm2,xmm6 psubw xmm2, xmm6
psubw xmm3,xmm7 psubw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
add rsi, byte 32*SIZEOF_DCTELEM add rsi, byte 32*SIZEOF_DCTELEM
add rdx, byte 32*SIZEOF_DCTELEM add rdx, byte 32*SIZEOF_DCTELEM
add rdi, byte 32*SIZEOF_JCOEF add rdi, byte 32*SIZEOF_JCOEF
dec rax dec rax
jnz near .quantloop jnz near .quantloop
uncollect_args uncollect_args
pop rbp pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -19,8 +19,8 @@
%include "jdct.inc" %include "jdct.inc"
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
BITS 32 BITS 32
; ;
; Load data into workspace, applying unsigned->signed conversion ; Load data into workspace, applying unsigned->signed conversion
; ;
@@ -29,70 +29,70 @@
; DCTELEM *workspace); ; DCTELEM *workspace);
; ;
%define sample_data ebp+8 ; JSAMPARRAY sample_data %define sample_data ebp+8 ; JSAMPARRAY sample_data
%define start_col ebp+12 ; JDIMENSION start_col %define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; DCTELEM *workspace %define workspace ebp+16 ; DCTELEM *workspace
align 16 align 16
global EXTN(jsimd_convsamp_sse2) global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2): EXTN(jsimd_convsamp_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
pxor xmm6,xmm6 ; xmm6=(all 0's) pxor xmm6, xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16,7 alignx 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
punpcklbw xmm0,xmm6 ; xmm0=(01234567) punpcklbw xmm0, xmm6 ; xmm0=(01234567)
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2,xmm7 paddw xmm2, xmm7
paddw xmm3,xmm7 paddw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
add esi, byte 4*SIZEOF_JSAMPROW add esi, byte 4*SIZEOF_JSAMPROW
add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec ecx dec ecx
jnz short .convloop jnz short .convloop
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
pop ebp pop ebp
ret ret
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; ;
@@ -107,93 +107,93 @@ EXTN(jsimd_convsamp_sse2):
; DCTELEM *workspace); ; DCTELEM *workspace);
; ;
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) %define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define coef_block ebp+8 ; JCOEFPTR coef_block %define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; DCTELEM *divisors %define divisors ebp+12 ; DCTELEM *divisors
%define workspace ebp+16 ; DCTELEM *workspace %define workspace ebp+16 ; DCTELEM *workspace
align 16 align 16
global EXTN(jsimd_quantize_sse2) global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2): EXTN(jsimd_quantize_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
mov esi, POINTER [workspace] mov esi, POINTER [workspace]
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/32 mov eax, DCTSIZE2/32
alignx 16,7 alignx 16, 7
.quantloop: .quantloop:
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
psraw xmm4,(WORD_BIT-1) psraw xmm4, (WORD_BIT-1)
psraw xmm5,(WORD_BIT-1) psraw xmm5, (WORD_BIT-1)
psraw xmm6,(WORD_BIT-1) psraw xmm6, (WORD_BIT-1)
psraw xmm7,(WORD_BIT-1) psraw xmm7, (WORD_BIT-1)
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 psubw xmm0, xmm4
psubw xmm1,xmm5 psubw xmm1, xmm5
psubw xmm2,xmm6 psubw xmm2, xmm6
psubw xmm3,xmm7 psubw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
add esi, byte 32*SIZEOF_DCTELEM add esi, byte 32*SIZEOF_DCTELEM
add edx, byte 32*SIZEOF_DCTELEM add edx, byte 32*SIZEOF_DCTELEM
add edi, byte 32*SIZEOF_JCOEF add edi, byte 32*SIZEOF_JCOEF
dec eax dec eax
jnz near .quantloop jnz near .quantloop
pop edi pop edi
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
; pop ebx ; unused ; pop ebx ; unused
pop ebp pop ebp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this. ; segment unless we do this.
align 16 align 16

View File

@@ -19,79 +19,79 @@
; -- jpeglib.h ; -- jpeglib.h
; ;
%define _cpp_protection_DCTSIZE DCTSIZE %define _cpp_protection_DCTSIZE DCTSIZE
%define _cpp_protection_DCTSIZE2 DCTSIZE2 %define _cpp_protection_DCTSIZE2 DCTSIZE2
; ;
; -- jmorecfg.h ; -- jmorecfg.h
; ;
%define _cpp_protection_RGB_RED RGB_RED %define _cpp_protection_RGB_RED RGB_RED
%define _cpp_protection_RGB_GREEN RGB_GREEN %define _cpp_protection_RGB_GREEN RGB_GREEN
%define _cpp_protection_RGB_BLUE RGB_BLUE %define _cpp_protection_RGB_BLUE RGB_BLUE
%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE %define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED %define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN %define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE %define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED %define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN %define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE %define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE %define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED %define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN %define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE %define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE %define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED %define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN %define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE %define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE %define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED %define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN %define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE %define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE %define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED %define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN %define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE %define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define RGBX_FILLER_0XFF 1 %define RGBX_FILLER_0XFF 1
; Representation of a single sample (pixel element value). ; Representation of a single sample (pixel element value).
; On this SIMD implementation, this must be 'unsigned char'. ; On this SIMD implementation, this must be 'unsigned char'.
; ;
%define JSAMPLE byte ; unsigned char %define JSAMPLE byte ; unsigned char
%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) %define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE %define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
; Representation of a DCT frequency coefficient. ; Representation of a DCT frequency coefficient.
; On this SIMD implementation, this must be 'short'. ; On this SIMD implementation, this must be 'short'.
; ;
%define JCOEF word ; short %define JCOEF word ; short
%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) %define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
; Datatype used for image dimensions. ; Datatype used for image dimensions.
; On this SIMD implementation, this must be 'unsigned int'. ; On this SIMD implementation, this must be 'unsigned int'.
; ;
%define JDIMENSION dword ; unsigned int %define JDIMENSION dword ; unsigned int
%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) %define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) %define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) %define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) %define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) %define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) %define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) %define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) %define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) %define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
; ;
; -- jdct.h ; -- jdct.h
@@ -101,30 +101,30 @@
; the DCT is to be performed in-place in that buffer. ; the DCT is to be performed in-place in that buffer.
; To maximize parallelism, Type DCTELEM is changed to short (originally, int). ; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
; ;
%define DCTELEM word ; short %define DCTELEM word ; short
%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) %define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
%define FAST_FLOAT FP32 ; float %define FAST_FLOAT FP32 ; float
%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) %define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
; To maximize parallelism, Type MULTIPLIER is changed to short. ; To maximize parallelism, Type MULTIPLIER is changed to short.
; ;
%define ISLOW_MULT_TYPE word ; must be short %define ISLOW_MULT_TYPE word ; must be short
%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) %define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
%define IFAST_MULT_TYPE word ; must be short %define IFAST_MULT_TYPE word ; must be short
%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) %define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors %define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
%define FLOAT_MULT_TYPE FP32 ; must be float %define FLOAT_MULT_TYPE FP32 ; must be float
%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) %define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
; ;
; -- jsimd.h ; -- jsimd.h
; ;
%define _cpp_protection_JSIMD_NONE JSIMD_NONE %define _cpp_protection_JSIMD_NONE JSIMD_NONE
%define _cpp_protection_JSIMD_MMX JSIMD_MMX %define _cpp_protection_JSIMD_MMX JSIMD_MMX
%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW %define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
%define _cpp_protection_JSIMD_SSE JSIMD_SSE %define _cpp_protection_JSIMD_SSE JSIMD_SSE
%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 %define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2

View File

@@ -38,11 +38,11 @@
; -- segment definition -- ; -- segment definition --
; ;
%ifdef __YASM_VER__ %ifdef __YASM_VER__
%define SEG_TEXT .text align=16 %define SEG_TEXT .text align=16
%define SEG_CONST .rdata align=16 %define SEG_CONST .rdata align=16
%else %else
%define SEG_TEXT .text align=16 public use32 class=CODE %define SEG_TEXT .text align=16 public use32 class=CODE
%define SEG_CONST .rdata align=16 public use32 class=CONST %define SEG_CONST .rdata align=16 public use32 class=CONST
%endif %endif
%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
@@ -57,15 +57,15 @@
%define SEG_TEXT .text align=16 public use64 class=CODE %define SEG_TEXT .text align=16 public use64 class=CODE
%define SEG_CONST .rdata align=16 public use64 class=CONST %define SEG_CONST .rdata align=16 public use64 class=CONST
%endif %endif
%define EXTN(name) name ; foo() -> foo %define EXTN(name) name ; foo() -> foo
%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- %elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32) ; * Borland C++ (Win32)
; -- segment definition -- ; -- segment definition --
; ;
%define SEG_TEXT _text align=16 public use32 class=CODE %define SEG_TEXT _text align=16 public use32 class=CODE
%define SEG_CONST _data align=16 public use32 class=DATA %define SEG_CONST _data align=16 public use32 class=DATA
%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ %elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
; * Linux ; * Linux
@@ -78,17 +78,17 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; -- segment definition -- ; -- segment definition --
; ;
%ifdef __x86_64__ %ifdef __x86_64__
%define SEG_TEXT .text progbits align=16 %define SEG_TEXT .text progbits align=16
%define SEG_CONST .rodata progbits align=16 %define SEG_CONST .rodata progbits align=16
%else %else
%define SEG_TEXT .text progbits alloc exec nowrite align=16 %define SEG_TEXT .text progbits alloc exec nowrite align=16
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16 %define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif %endif
; To make the code position-independent, append -DPIC to the commandline ; To make the code position-independent, append -DPIC to the commandline
; ;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC %define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
%define EXTN(name) name ; foo() -> foo %define EXTN(name) name ; foo() -> foo
%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- %elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format (nasm -f aout -DAOUT ...) ; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
@@ -96,20 +96,20 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; -- segment definition -- ; -- segment definition --
; ;
%define SEG_TEXT .text %define SEG_TEXT .text
%define SEG_CONST .data %define SEG_CONST .data
; To make the code position-independent, append -DPIC to the commandline ; To make the code position-independent, append -DPIC to the commandline
; ;
%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC %define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
; -- segment definition -- ; -- segment definition --
; ;
%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? %define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
%define SEG_CONST .rodata align=16 %define SEG_CONST .rodata align=16
; The generation of position-independent code (PIC) is the default on Darwin. ; The generation of position-independent code (PIC) is the default on Darwin.
; ;
@@ -120,10 +120,10 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; -- segment definition -- ; -- segment definition --
; ;
%define SEG_TEXT .text %define SEG_TEXT .text
%define SEG_CONST .data %define SEG_CONST .data
%endif ; ---------------------------------------------- %endif ; ----------------------------------------------
; ========================================================================== ; ==========================================================================
@@ -131,54 +131,54 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; Common types ; Common types
; ;
%ifdef __x86_64__ %ifdef __x86_64__
%define POINTER qword ; general pointer type %define POINTER qword ; general pointer type
%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT %define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%else %else
%define POINTER dword ; general pointer type %define POINTER dword ; general pointer type
%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) %define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT %define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%endif %endif
%define INT dword ; signed integer type %define INT dword ; signed integer type
%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) %define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT %define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
%define FP32 dword ; IEEE754 single %define FP32 dword ; IEEE754 single
%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) %define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT %define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
%define MMWORD qword ; int64 (MMX register) %define MMWORD qword ; int64 (MMX register)
%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) %define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT %define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
; NASM is buggy and doesn't properly handle operand sizes for SSE ; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank. ; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD ; int128 (SSE register) %define XMMWORD ; int128 (SSE register)
%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) %define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT %define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
; Similar hacks for when we load a dword or MMWORD into an xmm# register ; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD %define XMM_DWORD
%define XMM_MMWORD %define XMM_MMWORD
%define SIZEOF_BYTE 1 ; sizeof(BYTE) %define SIZEOF_BYTE 1 ; sizeof(BYTE)
%define SIZEOF_WORD 2 ; sizeof(WORD) %define SIZEOF_WORD 2 ; sizeof(WORD)
%define SIZEOF_DWORD 4 ; sizeof(DWORD) %define SIZEOF_DWORD 4 ; sizeof(DWORD)
%define SIZEOF_QWORD 8 ; sizeof(QWORD) %define SIZEOF_QWORD 8 ; sizeof(QWORD)
%define SIZEOF_OWORD 16 ; sizeof(OWORD) %define SIZEOF_OWORD 16 ; sizeof(OWORD)
%define BYTE_BIT 8 ; CHAR_BIT in C %define BYTE_BIT 8 ; CHAR_BIT in C
%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT %define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT %define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT %define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT %define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; External Symbol Name ; External Symbol Name
; ;
%ifndef EXTN %ifndef EXTN
%define EXTN(name) _ %+ name ; foo() -> _foo %define EXTN(name) _ %+ name ; foo() -> _foo
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -188,75 +188,76 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%undef PIC %undef PIC
%endif %endif
%ifdef PIC ; ------------------------------------------- %ifdef PIC ; -------------------------------------------
%ifidn GOT_SYMBOL,_MACHO_PIC_ ; -------------------- %ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
; At present, nasm doesn't seem to support PIC generation for Mach-O. ; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky. ; The PIC support code below is a little tricky.
SECTION SEG_CONST SECTION SEG_CONST
const_base: const_base:
%define GOTOFF(got,sym) (got) + (sym) - const_base %define GOTOFF(got,sym) (got) + (sym) - const_base
%imacro get_GOT 1 %imacro get_GOT 1
; NOTE: this macro destroys ecx resister. ; NOTE: this macro destroys ecx resister.
call %%geteip call %%geteip
add ecx, byte (%%ref - $) add ecx, byte (%%ref - $)
jmp short %%adjust jmp short %%adjust
%%geteip: %%geteip:
mov ecx, POINTER [esp] mov ecx, POINTER [esp]
ret ret
%%adjust: %%adjust:
push ebp push ebp
xor ebp,ebp ; ebp = 0 xor ebp, ebp ; ebp = 0
%ifidni %1,ebx ; (%1 == ebx) %ifidni %1, ebx ; (%1 == ebx)
; db 0x8D,0x9C + jmp near const_base = ; db 0x8D,0x9C + jmp near const_base =
; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
db 0x8D,0x9C ; 8D,9C db 0x8D, 0x9C ; 8D,9C
jmp near const_base ; E9,(const_base-%%ref) jmp near const_base ; E9,(const_base-%%ref)
%%ref: %%ref:
%else ; (%1 != ebx) %else ; (%1 != ebx)
; db 0x8D,0x8C + jmp near const_base = ; db 0x8D,0x8C + jmp near const_base =
; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
db 0x8D,0x8C ; 8D,8C db 0x8D, 0x8C ; 8D,8C
jmp near const_base ; E9,(const_base-%%ref) jmp near const_base ; E9,(const_base-%%ref)
%%ref: mov %1, ecx %%ref:
%endif ; (%1 == ebx) mov %1, ecx
pop ebp %endif ; (%1 == ebx)
pop ebp
%endmacro %endmacro
%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- %else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
%imacro get_GOT 1 %imacro get_GOT 1
extern GOT_SYMBOL extern GOT_SYMBOL
call %%geteip call %%geteip
add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
jmp short %%done jmp short %%done
%%geteip: %%geteip:
mov %1, POINTER [esp] mov %1, POINTER [esp]
ret ret
%%done: %%done:
%endmacro %endmacro
%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- %endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
%imacro pushpic 1.nolist %imacro pushpic 1.nolist
push %1 push %1
%endmacro %endmacro
%imacro poppic 1.nolist %imacro poppic 1.nolist
pop %1 pop %1
%endmacro %endmacro
%imacro movpic 2.nolist %imacro movpic 2.nolist
mov %1,%2 mov %1, %2
%endmacro %endmacro
%else ; !PIC ----------------------------------------- %else ; !PIC -----------------------------------------
%define GOTOFF(got,sym) (sym) %define GOTOFF(got,sym) (sym)
%imacro get_GOT 1.nolist %imacro get_GOT 1.nolist
%endmacro %endmacro
@@ -267,7 +268,7 @@ const_base:
%imacro movpic 2.nolist %imacro movpic 2.nolist
%endmacro %endmacro
%endif ; PIC ----------------------------------------- %endif ; PIC -----------------------------------------
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; Align the next instruction on {2,4,8,16,..}-byte boundary. ; Align the next instruction on {2,4,8,16,..}-byte boundary.
@@ -277,28 +278,29 @@ const_base:
%define FILLB(b,n) (($$-(b)) & ((n)-1)) %define FILLB(b,n) (($$-(b)) & ((n)-1))
%imacro alignx 1-2.nolist 0xFFFF %imacro alignx 1-2.nolist 0xFFFF
%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ %%bs: \
db 0x90 ; nop times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ db 0x90 ; nop
db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
db 0x8B,0xED ; mov ebp,ebp times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ db 0x8B,0xED ; mov ebp,ebp
db 0x90 ; nop times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
db 0x90 ; nop
%endmacro %endmacro
; Align the next data on {2,4,8,16,..}-byte boundary. ; Align the next data on {2,4,8,16,..}-byte boundary.
; ;
%imacro alignz 1.nolist %imacro alignz 1.nolist
align %1, db 0 ; filling zeros align %1, db 0 ; filling zeros
%endmacro %endmacro
%ifdef __x86_64__ %ifdef __x86_64__
@@ -306,61 +308,61 @@ const_base:
%ifdef WIN64 %ifdef WIN64
%imacro collect_args 0 %imacro collect_args 0
push r12 push r12
push r13 push r13
push r14 push r14
push r15 push r15
mov r10, rcx mov r10, rcx
mov r11, rdx mov r11, rdx
mov r12, r8 mov r12, r8
mov r13, r9 mov r13, r9
mov r14, [rax+48] mov r14, [rax+48]
mov r15, [rax+56] mov r15, [rax+56]
push rsi push rsi
push rdi push rdi
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm6 movaps XMMWORD [rsp], xmm6
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm7 movaps XMMWORD [rsp], xmm7
%endmacro %endmacro
%imacro uncollect_args 0 %imacro uncollect_args 0
movaps xmm7, XMMWORD [rsp] movaps xmm7, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD add rsp, SIZEOF_XMMWORD
movaps xmm6, XMMWORD [rsp] movaps xmm6, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD add rsp, SIZEOF_XMMWORD
pop rdi pop rdi
pop rsi pop rsi
pop r15 pop r15
pop r14 pop r14
pop r13 pop r13
pop r12 pop r12
%endmacro %endmacro
%else %else
%imacro collect_args 0 %imacro collect_args 0
push r10 push r10
push r11 push r11
push r12 push r12
push r13 push r13
push r14 push r14
push r15 push r15
mov r10, rdi mov r10, rdi
mov r11, rsi mov r11, rsi
mov r12, rdx mov r12, rdx
mov r13, rcx mov r13, rcx
mov r14, r8 mov r14, r8
mov r15, r9 mov r15, r9
%endmacro %endmacro
%imacro uncollect_args 0 %imacro uncollect_args 0
pop r15 pop r15
pop r14 pop r14
pop r13 pop r13
pop r12 pop r12
pop r11 pop r11
pop r10 pop r10
%endmacro %endmacro
%endif %endif