Reformat SSE/SSE2 SIMD code to improve readability

This commit is contained in:
DRC
2016-05-27 16:58:23 -05:00
parent 3ff13e651b
commit ff5685d534
43 changed files with 11067 additions and 11065 deletions

View File

@@ -42,17 +42,17 @@
EXTN(jsimd_rgb_ycc_convert_sse2): EXTN(jsimd_rgb_ycc_convert_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d mov ecx, r10d
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
@@ -70,7 +70,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov rsi, r11 mov rsi, r11
mov eax, r14d mov eax, r14d
test rax,rax test rax, rax
jle near .return jle near .return
.rowloop: .rowloop:
push rdx push rdx
@@ -92,7 +92,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
.column_ld1: .column_ld1:
push rax push rax
push rdx push rdx
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub rcx, byte SIZEOF_BYTE sub rcx, byte SIZEOF_BYTE
@@ -103,9 +103,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
sub rcx, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx] movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT shl rax, WORD_BIT
or rax,rdx or rax, rdx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop rdx pop rdx
pop rax pop rax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
@@ -113,18 +113,18 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
sub rcx, byte SIZEOF_DWORD sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx] movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx] movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
@@ -132,7 +132,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
@@ -147,49 +147,49 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -204,19 +204,19 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
sub rcx, byte SIZEOF_XMMWORD/8 sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
@@ -233,48 +233,48 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
@@ -296,158 +296,158 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm1,xmm5 ; xmm1=BOL punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6,xmm5 ; xmm6=BOH punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1,1 ; xmm1=BOL*FIX(0.500) psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6,1 ; xmm6=BOH*FIX(0.500) psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm7,xmm5 paddd xmm7, xmm5
paddd xmm4,xmm5 paddd xmm4, xmm5
psrld xmm7,SCALEBITS ; xmm7=CbOL psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4,SCALEBITS ; xmm4=CbOH psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7,xmm4 ; xmm7=CbO packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
movdqa xmm5,xmm0 movdqa xmm5, xmm0
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0,xmm0 pxor xmm0, xmm0
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm0,xmm1 ; xmm0=BEL punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6,xmm1 ; xmm6=BEH punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0,1 ; xmm0=BEL*FIX(0.500) psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6,1 ; xmm6=BEH*FIX(0.500) psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5,xmm0 paddd xmm5, xmm0
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm4,xmm1 paddd xmm4, xmm1
psrld xmm5,SCALEBITS ; xmm5=CbEL psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4,SCALEBITS ; xmm4=CbEH psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5,xmm4 ; xmm5=CbE packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm5,xmm7 ; xmm5=Cb por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [rbx], xmm5 ; Save Cb movdqa XMMWORD [rbx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
movdqa xmm7,xmm0 movdqa xmm7, xmm0
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)] paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)] paddd xmm4, XMMWORD [wk(5)]
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3,xmm3 pxor xmm3, xmm3
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm3,xmm1 ; xmm3=ROL punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4,xmm1 ; xmm4=ROH punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3,1 ; xmm3=ROL*FIX(0.500) psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4,1 ; xmm4=ROH*FIX(0.500) psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm3 paddd xmm7, xmm3
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm5,xmm1 paddd xmm5, xmm1
psrld xmm7,SCALEBITS ; xmm7=CrOL psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5,SCALEBITS ; xmm5=CrOH psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7,xmm5 ; xmm7=CrO packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)] paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)] paddd xmm4, XMMWORD [wk(7)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y movdqa XMMWORD [rdi], xmm6 ; Save Y
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm2,xmm3 ; xmm2=REL punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4,xmm3 ; xmm4=REH punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2,1 ; xmm2=REL*FIX(0.500) psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4,1 ; xmm4=REH*FIX(0.500) psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm1,xmm0 paddd xmm1, xmm0
paddd xmm5,xmm0 paddd xmm5, xmm0
psrld xmm1,SCALEBITS ; xmm1=CrEL psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5,SCALEBITS ; xmm5=CrEH psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1,xmm5 ; xmm1=CrE packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm1,xmm7 ; xmm1=Cr por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [rdx], xmm1 ; Save Cr movdqa XMMWORD [rdx], xmm1 ; Save Cr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
@@ -457,7 +457,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
add rdx, byte SIZEOF_XMMWORD ; outptr2 add rdx, byte SIZEOF_XMMWORD ; outptr2
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test rcx,rcx test rcx, rcx
jnz near .column_ld1 jnz near .column_ld1
pop rcx ; col pop rcx ; col
@@ -476,7 +476,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -42,11 +42,11 @@
EXTN(jsimd_rgb_ycc_convert_sse2): EXTN(jsimd_rgb_ycc_convert_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
@@ -59,7 +59,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
@@ -77,9 +77,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov esi, JSAMPARRAY [input_buf(eax)] mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax,eax test eax, eax
jle near .return jle near .return
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
pushpic eax pushpic eax
push edx push edx
@@ -96,14 +96,14 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
alignx 16,7 alignx 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1: .column_ld1:
push eax push eax
push edx push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub ecx, byte SIZEOF_BYTE sub ecx, byte SIZEOF_BYTE
@@ -114,9 +114,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
sub ecx, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx] movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT shl eax, WORD_BIT
or eax,edx or eax, edx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop edx pop edx
pop eax pop eax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
@@ -124,18 +124,18 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
sub ecx, byte SIZEOF_DWORD sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx] movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx] movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
@@ -143,11 +143,11 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -159,49 +159,49 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -216,23 +216,23 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
sub ecx, byte SIZEOF_XMMWORD/8 sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv jz short .rgb_ycc_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -246,48 +246,48 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
@@ -309,158 +309,158 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm1,xmm5 ; xmm1=BOL punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6,xmm5 ; xmm6=BOH punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1,1 ; xmm1=BOL*FIX(0.500) psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6,1 ; xmm6=BOH*FIX(0.500) psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm7,xmm5 paddd xmm7, xmm5
paddd xmm4,xmm5 paddd xmm4, xmm5
psrld xmm7,SCALEBITS ; xmm7=CbOL psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4,SCALEBITS ; xmm4=CbOH psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7,xmm4 ; xmm7=CbO packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
movdqa xmm5,xmm0 movdqa xmm5, xmm0
movdqa xmm4,xmm6 movdqa xmm4, xmm6
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0,xmm0 pxor xmm0, xmm0
pxor xmm6,xmm6 pxor xmm6, xmm6
punpcklwd xmm0,xmm1 ; xmm0=BEL punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6,xmm1 ; xmm6=BEH punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0,1 ; xmm0=BEL*FIX(0.500) psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6,1 ; xmm6=BEH*FIX(0.500) psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5,xmm0 paddd xmm5, xmm0
paddd xmm4,xmm6 paddd xmm4, xmm6
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm4,xmm1 paddd xmm4, xmm1
psrld xmm5,SCALEBITS ; xmm5=CbEL psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4,SCALEBITS ; xmm4=CbEH psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5,xmm4 ; xmm5=CbE packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm5,xmm7 ; xmm5=Cb por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [ebx], xmm5 ; Save Cb movdqa XMMWORD [ebx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
movdqa xmm7,xmm0 movdqa xmm7, xmm0
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)] paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)] paddd xmm4, XMMWORD [wk(5)]
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3,xmm3 pxor xmm3, xmm3
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm3,xmm1 ; xmm3=ROL punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4,xmm1 ; xmm4=ROH punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3,1 ; xmm3=ROL*FIX(0.500) psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4,1 ; xmm4=ROH*FIX(0.500) psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7,xmm3 paddd xmm7, xmm3
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm5,xmm1 paddd xmm5, xmm1
psrld xmm7,SCALEBITS ; xmm7=CrOL psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5,SCALEBITS ; xmm5=CrOH psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7,xmm5 ; xmm7=CrO packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)] paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)] paddd xmm4, XMMWORD [wk(7)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y movdqa XMMWORD [edi], xmm6 ; Save Y
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm2,xmm3 ; xmm2=REL punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4,xmm3 ; xmm4=REH punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2,1 ; xmm2=REL*FIX(0.500) psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4,1 ; xmm4=REH*FIX(0.500) psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm5,xmm4 paddd xmm5, xmm4
paddd xmm1,xmm0 paddd xmm1, xmm0
paddd xmm5,xmm0 paddd xmm5, xmm0
psrld xmm1,SCALEBITS ; xmm1=CrEL psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5,SCALEBITS ; xmm5=CrEH psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1,xmm5 ; xmm1=CrE packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
por xmm1,xmm7 ; xmm1=Cr por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [edx], xmm1 ; Save Cr movdqa XMMWORD [edx], xmm1 ; Save Cr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
@@ -470,7 +470,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
add edx, byte SIZEOF_XMMWORD ; outptr2 add edx, byte SIZEOF_XMMWORD ; outptr2
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test ecx,ecx test ecx, ecx
jnz near .column_ld1 jnz near .column_ld1
pop ecx ; col pop ecx ; col
@@ -493,7 +493,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -42,17 +42,17 @@
EXTN(jsimd_rgb_gray_convert_sse2): EXTN(jsimd_rgb_gray_convert_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d mov ecx, r10d
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
@@ -66,7 +66,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
mov rsi, r11 mov rsi, r11
mov eax, r14d mov eax, r14d
test rax,rax test rax, rax
jle near .return jle near .return
.rowloop: .rowloop:
push rdi push rdi
@@ -84,7 +84,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
.column_ld1: .column_ld1:
push rax push rax
push rdx push rdx
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub rcx, byte SIZEOF_BYTE sub rcx, byte SIZEOF_BYTE
@@ -95,9 +95,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
sub rcx, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx] movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT shl rax, WORD_BIT
or rax,rdx or rax, rdx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop rdx pop rdx
pop rax pop rax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
@@ -105,18 +105,18 @@ EXTN(jsimd_rgb_gray_convert_sse2):
sub rcx, byte SIZEOF_DWORD sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx] movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx] movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
@@ -124,7 +124,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
@@ -139,49 +139,49 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -196,19 +196,19 @@ EXTN(jsimd_rgb_gray_convert_sse2):
sub rcx, byte SIZEOF_XMMWORD/8 sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
@@ -225,48 +225,48 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
@@ -279,19 +279,19 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; (This implementation) ; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
@@ -299,40 +299,40 @@ EXTN(jsimd_rgb_gray_convert_sse2):
movdqa xmm0, xmm5 ; xmm0=BO movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm4, xmm7 paddd xmm4, xmm7
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)] paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)] paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y movdqa XMMWORD [rdi], xmm6 ; Save Y
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
@@ -340,7 +340,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
add rdi, byte SIZEOF_XMMWORD ; outptr0 add rdi, byte SIZEOF_XMMWORD ; outptr0
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test rcx,rcx test rcx, rcx
jnz near .column_ld1 jnz near .column_ld1
pop rcx ; col pop rcx ; col
@@ -355,7 +355,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -44,11 +44,11 @@
EXTN(jsimd_rgb_gray_convert_sse2): EXTN(jsimd_rgb_gray_convert_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
@@ -61,7 +61,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
@@ -75,9 +75,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
mov esi, JSAMPARRAY [input_buf(eax)] mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax,eax test eax, eax
jle near .return jle near .return
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
pushpic eax pushpic eax
push edi push edi
@@ -90,14 +90,14 @@ EXTN(jsimd_rgb_gray_convert_sse2):
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
alignx 16,7 alignx 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1: .column_ld1:
push eax push eax
push edx push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE test cl, SIZEOF_BYTE
jz short .column_ld2 jz short .column_ld2
sub ecx, byte SIZEOF_BYTE sub ecx, byte SIZEOF_BYTE
@@ -108,9 +108,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
sub ecx, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx] movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT shl eax, WORD_BIT
or eax,edx or eax, edx
.column_ld4: .column_ld4:
movd xmmA,eax movd xmmA, eax
pop edx pop edx
pop eax pop eax
test cl, SIZEOF_DWORD test cl, SIZEOF_DWORD
@@ -118,18 +118,18 @@ EXTN(jsimd_rgb_gray_convert_sse2):
sub ecx, byte SIZEOF_DWORD sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx] movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF por xmmA, xmmF
.column_ld8: .column_ld8:
test cl, SIZEOF_MMWORD test cl, SIZEOF_MMWORD
jz short .column_ld16 jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx] movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB por xmmA, xmmB
.column_ld16: .column_ld16:
test cl, SIZEOF_XMMWORD test cl, SIZEOF_XMMWORD
jz short .column_ld32 jz short .column_ld32
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
@@ -137,11 +137,11 @@ EXTN(jsimd_rgb_gray_convert_sse2):
test cl, 2*SIZEOF_XMMWORD test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmB,xmmA movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -153,49 +153,49 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH pxor xmmH, xmmH
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD movdqa xmmF, xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -210,23 +210,23 @@ EXTN(jsimd_rgb_gray_convert_sse2):
sub ecx, byte SIZEOF_XMMWORD/8 sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE por xmmA, xmmE
.column_ld4: .column_ld4:
test cl, SIZEOF_XMMWORD/4 test cl, SIZEOF_XMMWORD/4
jz short .column_ld8 jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8: .column_ld8:
test cl, SIZEOF_XMMWORD/2 test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv jz short .rgb_gray_cnv
movdqa xmmF,xmmA movdqa xmmF, xmmA
movdqa xmmH,xmmE movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -240,48 +240,48 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF movdqa xmmC, xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA movdqa xmmB, xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD movdqa xmmG, xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA movdqa xmmE, xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB movdqa xmmH, xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF pxor xmmF, xmmF
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB movdqa xmmD, xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE movdqa xmmG, xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH punpcklbw xmmF, xmmH
punpckhbw xmmH,xmmH punpckhbw xmmH, xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
@@ -294,19 +294,19 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; (This implementation) ; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1 movdqa xmm6, xmm1
punpcklwd xmm1,xmm3 punpcklwd xmm1, xmm3
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0 movdqa xmm6, xmm0
punpcklwd xmm0,xmm2 punpcklwd xmm0, xmm2
punpckhwd xmm6,xmm2 punpckhwd xmm6, xmm2
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
@@ -314,40 +314,40 @@ EXTN(jsimd_rgb_gray_convert_sse2):
movdqa xmm0, xmm5 ; xmm0=BO movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm4,xmm3 punpckhwd xmm4, xmm3
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm4, xmm7 paddd xmm4, xmm7
paddd xmm0,xmm3 paddd xmm0, xmm3
paddd xmm4,xmm3 paddd xmm4, xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO packssdw xmm0, xmm4 ; xmm0=YO
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm6,xmm2 punpcklwd xmm6, xmm2
punpckhwd xmm4,xmm2 punpckhwd xmm4, xmm2
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)] paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)] paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2 paddd xmm6, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
por xmm6,xmm0 ; xmm6=Y por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y movdqa XMMWORD [edi], xmm6 ; Save Y
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
@@ -355,7 +355,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
add edi, byte SIZEOF_XMMWORD ; outptr0 add edi, byte SIZEOF_XMMWORD ; outptr0
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test ecx,ecx test ecx, ecx
jnz near .column_ld1 jnz near .column_ld1
pop ecx ; col pop ecx ; col
@@ -374,7 +374,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -187,10 +187,10 @@ EXTN(jconst_huff_encode_one_block):
EXTN(jsimd_huff_encode_one_block_sse2): EXTN(jsimd_huff_encode_one_block_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2] lea rsp, [t2]
collect_args collect_args
@@ -350,7 +350,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
add rsp, 4*SIZEOF_XMMWORD add rsp, 4*SIZEOF_XMMWORD
%endif %endif
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -218,7 +218,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov esi, ecx mov esi, ecx
; This is a well-known technique for obtaining the absolute value ; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique ; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors", ; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog. ; Copyright (c) 1996, 1997 by Agner Fog.
mov edx, ecx mov edx, ecx

View File

@@ -44,12 +44,12 @@
EXTN(jsimd_h2v1_downsample_sse2): EXTN(jsimd_h2v1_downsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov ecx, r13d mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return jz near .return
mov edx, r10d mov edx, r10d
@@ -57,12 +57,12 @@ EXTN(jsimd_h2v1_downsample_sse2):
; -- expand_right_edge ; -- expand_right_edge
push rcx push rcx
shl rcx,1 ; output_cols * 2 shl rcx, 1 ; output_cols * 2
sub rcx,rdx sub rcx, rdx
jle short .expand_end jle short .expand_end
mov rax, r11 mov rax, r11
test rax,rax test rax, rax
jle short .expand_end jle short .expand_end
cld cld
@@ -72,7 +72,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
push rcx push rcx
mov rdi, JSAMPROW [rsi] mov rdi, JSAMPROW [rsi]
add rdi,rdx add rdi, rdx
mov al, JSAMPLE [rdi-1] mov al, JSAMPLE [rdi-1]
rep stosb rep stosb
@@ -90,14 +90,14 @@ EXTN(jsimd_h2v1_downsample_sse2):
; -- h2v1_downsample ; -- h2v1_downsample
mov eax, r12d ; rowctr mov eax, r12d ; rowctr
test eax,eax test eax, eax
jle near .return jle near .return
mov rdx, 0x00010000 ; bias pattern mov rdx, 0x00010000 ; bias pattern
movd xmm7,edx movd xmm7, edx
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov rsi, r14 ; input_data mov rsi, r14 ; input_data
mov rdi, r15 ; output_data mov rdi, r15 ; output_data
@@ -114,7 +114,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
pxor xmm1,xmm1 pxor xmm1, xmm1
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
@@ -123,22 +123,22 @@ EXTN(jsimd_h2v1_downsample_sse2):
movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample: .downsample:
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm3,xmm1 movdqa xmm3, xmm1
pand xmm0,xmm6 pand xmm0, xmm6
psrlw xmm2,BYTE_BIT psrlw xmm2, BYTE_BIT
pand xmm1,xmm6 pand xmm1, xmm6
psrlw xmm3,BYTE_BIT psrlw xmm3, BYTE_BIT
paddw xmm0,xmm2 paddw xmm0, xmm2
paddw xmm1,xmm3 paddw xmm1, xmm3
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
psrlw xmm0,1 psrlw xmm0, 1
psrlw xmm1,1 psrlw xmm1, 1
packuswb xmm0,xmm1 packuswb xmm0, xmm1
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
@@ -147,7 +147,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
add rdi, byte 1*SIZEOF_XMMWORD ; outptr add rdi, byte 1*SIZEOF_XMMWORD ; outptr
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
test rcx,rcx test rcx, rcx
jnz short .columnloop_r8 jnz short .columnloop_r8
pop rsi pop rsi
@@ -188,12 +188,12 @@ EXTN(jsimd_h2v1_downsample_sse2):
EXTN(jsimd_h2v2_downsample_sse2): EXTN(jsimd_h2v2_downsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov ecx, r13d mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return jz near .return
mov edx, r10d mov edx, r10d
@@ -201,12 +201,12 @@ EXTN(jsimd_h2v2_downsample_sse2):
; -- expand_right_edge ; -- expand_right_edge
push rcx push rcx
shl rcx,1 ; output_cols * 2 shl rcx, 1 ; output_cols * 2
sub rcx,rdx sub rcx, rdx
jle short .expand_end jle short .expand_end
mov rax, r11 mov rax, r11
test rax,rax test rax, rax
jle short .expand_end jle short .expand_end
cld cld
@@ -216,7 +216,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
push rcx push rcx
mov rdi, JSAMPROW [rsi] mov rdi, JSAMPROW [rsi]
add rdi,rdx add rdi, rdx
mov al, JSAMPLE [rdi-1] mov al, JSAMPLE [rdi-1]
rep stosb rep stosb
@@ -234,14 +234,14 @@ EXTN(jsimd_h2v2_downsample_sse2):
; -- h2v2_downsample ; -- h2v2_downsample
mov eax, r12d ; rowctr mov eax, r12d ; rowctr
test rax,rax test rax, rax
jle near .return jle near .return
mov rdx, 0x00020001 ; bias pattern mov rdx, 0x00020001 ; bias pattern
movd xmm7,edx movd xmm7, edx
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov rsi, r14 ; input_data mov rsi, r14 ; input_data
mov rdi, r15 ; output_data mov rdi, r15 ; output_data
@@ -260,8 +260,8 @@ EXTN(jsimd_h2v2_downsample_sse2):
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm3,xmm3 pxor xmm3, xmm3
mov rcx, SIZEOF_XMMWORD mov rcx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
@@ -272,32 +272,32 @@ EXTN(jsimd_h2v2_downsample_sse2):
movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample: .downsample:
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
pand xmm0,xmm6 pand xmm0, xmm6
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
pand xmm1,xmm6 pand xmm1, xmm6
psrlw xmm5,BYTE_BIT psrlw xmm5, BYTE_BIT
paddw xmm0,xmm4 paddw xmm0, xmm4
paddw xmm1,xmm5 paddw xmm1, xmm5
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
pand xmm2,xmm6 pand xmm2, xmm6
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
pand xmm3,xmm6 pand xmm3, xmm6
psrlw xmm5,BYTE_BIT psrlw xmm5, BYTE_BIT
paddw xmm2,xmm4 paddw xmm2, xmm4
paddw xmm3,xmm5 paddw xmm3, xmm5
paddw xmm0,xmm1 paddw xmm0, xmm1
paddw xmm2,xmm3 paddw xmm2, xmm3
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm2,xmm7 paddw xmm2, xmm7
psrlw xmm0,2 psrlw xmm0, 2
psrlw xmm2,2 psrlw xmm2, 2
packuswb xmm0,xmm2 packuswb xmm0, xmm2
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
@@ -307,7 +307,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
add rdi, byte 1*SIZEOF_XMMWORD ; outptr add rdi, byte 1*SIZEOF_XMMWORD ; outptr
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test rcx,rcx test rcx, rcx
jnz near .columnloop_r8 jnz near .columnloop_r8
pop rsi pop rsi

View File

@@ -43,7 +43,7 @@
EXTN(jsimd_h2v1_downsample_sse2): EXTN(jsimd_h2v1_downsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -51,7 +51,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
push edi push edi
mov ecx, JDIMENSION [width_blks(ebp)] mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return jz near .return
mov edx, JDIMENSION [img_width(ebp)] mov edx, JDIMENSION [img_width(ebp)]
@@ -59,23 +59,23 @@ EXTN(jsimd_h2v1_downsample_sse2):
; -- expand_right_edge ; -- expand_right_edge
push ecx push ecx
shl ecx,1 ; output_cols * 2 shl ecx, 1 ; output_cols * 2
sub ecx,edx sub ecx, edx
jle short .expand_end jle short .expand_end
mov eax, INT [max_v_samp(ebp)] mov eax, INT [max_v_samp(ebp)]
test eax,eax test eax, eax
jle short .expand_end jle short .expand_end
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16,7 alignx 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
mov edi, JSAMPROW [esi] mov edi, JSAMPROW [esi]
add edi,edx add edi, edx
mov al, JSAMPLE [edi-1] mov al, JSAMPLE [edi-1]
rep stosb rep stosb
@@ -93,18 +93,18 @@ EXTN(jsimd_h2v1_downsample_sse2):
; -- h2v1_downsample ; -- h2v1_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax,eax test eax, eax
jle near .return jle near .return
mov edx, 0x00010000 ; bias pattern mov edx, 0x00010000 ; bias pattern
movd xmm7,edx movd xmm7, edx
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -115,36 +115,36 @@ EXTN(jsimd_h2v1_downsample_sse2):
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
alignx 16,7 alignx 16, 7
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm1,xmm1 pxor xmm1, xmm1
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample: .downsample:
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm3,xmm1 movdqa xmm3, xmm1
pand xmm0,xmm6 pand xmm0, xmm6
psrlw xmm2,BYTE_BIT psrlw xmm2, BYTE_BIT
pand xmm1,xmm6 pand xmm1, xmm6
psrlw xmm3,BYTE_BIT psrlw xmm3, BYTE_BIT
paddw xmm0,xmm2 paddw xmm0, xmm2
paddw xmm1,xmm3 paddw xmm1, xmm3
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
psrlw xmm0,1 psrlw xmm0, 1
psrlw xmm1,1 psrlw xmm1, 1
packuswb xmm0,xmm1 packuswb xmm0, xmm1
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
@@ -153,7 +153,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
add edi, byte 1*SIZEOF_XMMWORD ; outptr add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
test ecx,ecx test ecx, ecx
jnz short .columnloop_r8 jnz short .columnloop_r8
pop esi pop esi
@@ -198,7 +198,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
EXTN(jsimd_h2v2_downsample_sse2): EXTN(jsimd_h2v2_downsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -206,7 +206,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
push edi push edi
mov ecx, JDIMENSION [width_blks(ebp)] mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return jz near .return
mov edx, JDIMENSION [img_width(ebp)] mov edx, JDIMENSION [img_width(ebp)]
@@ -214,23 +214,23 @@ EXTN(jsimd_h2v2_downsample_sse2):
; -- expand_right_edge ; -- expand_right_edge
push ecx push ecx
shl ecx,1 ; output_cols * 2 shl ecx, 1 ; output_cols * 2
sub ecx,edx sub ecx, edx
jle short .expand_end jle short .expand_end
mov eax, INT [max_v_samp(ebp)] mov eax, INT [max_v_samp(ebp)]
test eax,eax test eax, eax
jle short .expand_end jle short .expand_end
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16,7 alignx 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
mov edi, JSAMPROW [esi] mov edi, JSAMPROW [esi]
add edi,edx add edi, edx
mov al, JSAMPLE [edi-1] mov al, JSAMPLE [edi-1]
rep stosb rep stosb
@@ -248,18 +248,18 @@ EXTN(jsimd_h2v2_downsample_sse2):
; -- h2v2_downsample ; -- h2v2_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax,eax test eax, eax
jle near .return jle near .return
mov edx, 0x00020001 ; bias pattern mov edx, 0x00020001 ; bias pattern
movd xmm7,edx movd xmm7, edx
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -271,16 +271,16 @@ EXTN(jsimd_h2v2_downsample_sse2):
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
alignx 16,7 alignx 16, 7
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm2,xmm2 pxor xmm2, xmm2
pxor xmm3,xmm3 pxor xmm3, xmm3
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
@@ -289,32 +289,32 @@ EXTN(jsimd_h2v2_downsample_sse2):
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample: .downsample:
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
pand xmm0,xmm6 pand xmm0, xmm6
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
pand xmm1,xmm6 pand xmm1, xmm6
psrlw xmm5,BYTE_BIT psrlw xmm5, BYTE_BIT
paddw xmm0,xmm4 paddw xmm0, xmm4
paddw xmm1,xmm5 paddw xmm1, xmm5
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
pand xmm2,xmm6 pand xmm2, xmm6
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
pand xmm3,xmm6 pand xmm3, xmm6
psrlw xmm5,BYTE_BIT psrlw xmm5, BYTE_BIT
paddw xmm2,xmm4 paddw xmm2, xmm4
paddw xmm3,xmm5 paddw xmm3, xmm5
paddw xmm0,xmm1 paddw xmm0, xmm1
paddw xmm2,xmm3 paddw xmm2, xmm3
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm2,xmm7 paddw xmm2, xmm7
psrlw xmm0,2 psrlw xmm0, 2
psrlw xmm2,2 psrlw xmm2, 2
packuswb xmm0,xmm2 packuswb xmm0, xmm2
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
@@ -324,7 +324,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
add edi, byte 1*SIZEOF_XMMWORD ; outptr add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
test ecx,ecx test ecx, ecx
jnz near .columnloop_r8 jnz near .columnloop_r8
pop esi pop esi

View File

@@ -42,17 +42,17 @@
EXTN(jsimd_ycc_rgb_convert_sse2): EXTN(jsimd_ycc_rgb_convert_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d ; num_cols mov ecx, r10d ; num_cols
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
@@ -70,7 +70,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov rdi, r13 mov rdi, r13
mov eax, r14d mov eax, r14d
test rax,rax test rax, rax
jle near .return jle near .return
.rowloop: .rowloop:
push rax push rax
@@ -89,21 +89,21 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
paddw xmm4,xmm7 paddw xmm4, xmm7
paddw xmm5,xmm7 paddw xmm5, xmm7
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
@@ -115,85 +115,85 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm2,xmm4 ; xmm2=CbE movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3,xmm5 ; xmm3=CbO movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4,xmm4 ; xmm4=2*CbE paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5,xmm5 ; xmm5=2*CbO paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6,xmm0 ; xmm6=CrE movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7,xmm1 ; xmm7=CrO movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0,xmm0 ; xmm0=2*CrE paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1,xmm1 ; xmm1=2*CrO paddw xmm1, xmm1 ; xmm1=2*CrO
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
paddw xmm4,[rel PW_ONE] paddw xmm4, [rel PW_ONE]
paddw xmm5,[rel PW_ONE] paddw xmm5, [rel PW_ONE]
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0,[rel PW_ONE] paddw xmm0, [rel PW_ONE]
paddw xmm1,[rel PW_ONE] paddw xmm1, [rel PW_ONE]
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm5,xmm3 paddw xmm5, xmm3
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
punpcklwd xmm2,xmm6 punpcklwd xmm2, xmm6
punpckhwd xmm4,xmm6 punpckhwd xmm4, xmm6
pmaddwd xmm2,[rel PW_MF0344_F0285] pmaddwd xmm2, [rel PW_MF0344_F0285]
pmaddwd xmm4,[rel PW_MF0344_F0285] pmaddwd xmm4, [rel PW_MF0344_F0285]
punpcklwd xmm3,xmm7 punpcklwd xmm3, xmm7
punpckhwd xmm5,xmm7 punpckhwd xmm5, xmm7
pmaddwd xmm3,[rel PW_MF0344_F0285] pmaddwd xmm3, [rel PW_MF0344_F0285]
pmaddwd xmm5,[rel PW_MF0344_F0285] pmaddwd xmm5, [rel PW_MF0344_F0285]
paddd xmm2,[rel PD_ONEHALF] paddd xmm2, [rel PD_ONEHALF]
paddd xmm4,[rel PD_ONEHALF] paddd xmm4, [rel PD_ONEHALF]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm4,SCALEBITS psrad xmm4, SCALEBITS
paddd xmm3,[rel PD_ONEHALF] paddd xmm3, [rel PD_ONEHALF]
paddd xmm5,[rel PD_ONEHALF] paddd xmm5, [rel PD_ONEHALF]
psrad xmm3,SCALEBITS psrad xmm3, SCALEBITS
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -202,44 +202,44 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -272,7 +272,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub rcx, byte 2*SIZEOF_XMMWORD sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
@@ -280,7 +280,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
@@ -320,35 +320,35 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -382,15 +382,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub rcx, byte SIZEOF_XMMWORD/2 sub rcx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4 cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
@@ -430,7 +430,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -44,11 +44,11 @@
EXTN(jsimd_ycc_rgb_convert_sse2): EXTN(jsimd_ycc_rgb_convert_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
@@ -61,7 +61,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
@@ -79,9 +79,9 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov edi, JSAMPARRAY [output_buf(eax)] mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax,eax test eax, eax
jle near .return jle near .return
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push eax push eax
push edi push edi
@@ -95,27 +95,27 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov edx, JSAMPROW [edx] ; inptr2 mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax) movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psrlw xmm4,BYTE_BIT psrlw xmm4, BYTE_BIT
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
paddw xmm4,xmm7 paddw xmm4, xmm7
paddw xmm5,xmm7 paddw xmm5, xmm7
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
@@ -127,85 +127,85 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm2,xmm4 ; xmm2=CbE movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3,xmm5 ; xmm3=CbO movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4,xmm4 ; xmm4=2*CbE paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5,xmm5 ; xmm5=2*CbO paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6,xmm0 ; xmm6=CrE movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7,xmm1 ; xmm7=CrO movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0,xmm0 ; xmm0=2*CrE paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1,xmm1 ; xmm1=2*CrO paddw xmm1, xmm1 ; xmm1=2*CrO
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
paddw xmm4,[GOTOFF(eax,PW_ONE)] paddw xmm4, [GOTOFF(eax,PW_ONE)]
paddw xmm5,[GOTOFF(eax,PW_ONE)] paddw xmm5, [GOTOFF(eax,PW_ONE)]
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0,[GOTOFF(eax,PW_ONE)] paddw xmm0, [GOTOFF(eax,PW_ONE)]
paddw xmm1,[GOTOFF(eax,PW_ONE)] paddw xmm1, [GOTOFF(eax,PW_ONE)]
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm5,xmm3 paddw xmm5, xmm3
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm5,xmm3 movdqa xmm5, xmm3
punpcklwd xmm2,xmm6 punpcklwd xmm2, xmm6
punpckhwd xmm4,xmm6 punpckhwd xmm4, xmm6
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm3,xmm7 punpcklwd xmm3, xmm7
punpckhwd xmm5,xmm7 punpckhwd xmm5, xmm7
pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm4,SCALEBITS psrad xmm4, SCALEBITS
paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm3,SCALEBITS psrad xmm3, SCALEBITS
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
pcmpeqw xmm4,xmm4 pcmpeqw xmm4, xmm4
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -214,44 +214,44 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -276,7 +276,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -285,7 +285,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
@@ -293,7 +293,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
@@ -333,35 +333,35 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -388,7 +388,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
@@ -396,15 +396,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2 sub ecx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4 cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
@@ -424,7 +424,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
alignx 16,7 alignx 16, 7
.nextrow: .nextrow:
pop ecx pop ecx
@@ -449,7 +449,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -42,17 +42,17 @@
EXTN(jsimd_h2v1_merged_upsample_sse2): EXTN(jsimd_h2v1_merged_upsample_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov ecx, r10d ; col mov ecx, r10d ; col
test rcx,rcx test rcx, rcx
jz near .return jz near .return
push rcx push rcx
@@ -75,21 +75,21 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
pxor xmm1,xmm1 ; xmm1=(all 0's) pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3,xmm3 pcmpeqw xmm3, xmm3
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0,xmm7 movdqa xmm0, xmm7
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
paddw xmm6,xmm3 paddw xmm6, xmm3
paddw xmm4,xmm3 paddw xmm4, xmm3
paddw xmm7,xmm3 paddw xmm7, xmm3
paddw xmm0,xmm3 paddw xmm0, xmm3
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
@@ -101,67 +101,67 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm5,xmm6 ; xmm5=CbH movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2,xmm4 ; xmm2=CbL movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6,xmm6 ; xmm6=2*CbH paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4,xmm4 ; xmm4=2*CbL paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1,xmm7 ; xmm1=CrH movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3,xmm0 ; xmm3=CrL movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7,xmm7 ; xmm7=2*CrH paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0,xmm0 ; xmm0=2*CrL paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
paddw xmm6,[rel PW_ONE] paddw xmm6, [rel PW_ONE]
paddw xmm4,[rel PW_ONE] paddw xmm4, [rel PW_ONE]
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7,[rel PW_ONE] paddw xmm7, [rel PW_ONE]
paddw xmm0,[rel PW_ONE] paddw xmm0, [rel PW_ONE]
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6,xmm5 paddw xmm6, xmm5
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm5,xmm1 punpcklwd xmm5, xmm1
punpckhwd xmm6,xmm1 punpckhwd xmm6, xmm1
pmaddwd xmm5,[rel PW_MF0344_F0285] pmaddwd xmm5, [rel PW_MF0344_F0285]
pmaddwd xmm6,[rel PW_MF0344_F0285] pmaddwd xmm6, [rel PW_MF0344_F0285]
punpcklwd xmm2,xmm3 punpcklwd xmm2, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
pmaddwd xmm2,[rel PW_MF0344_F0285] pmaddwd xmm2, [rel PW_MF0344_F0285]
pmaddwd xmm7,[rel PW_MF0344_F0285] pmaddwd xmm7, [rel PW_MF0344_F0285]
paddd xmm5,[rel PD_ONEHALF] paddd xmm5, [rel PD_ONEHALF]
paddd xmm6,[rel PD_ONEHALF] paddd xmm6, [rel PD_ONEHALF]
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
psrad xmm6,SCALEBITS psrad xmm6, SCALEBITS
paddd xmm2,[rel PD_ONEHALF] paddd xmm2, [rel PD_ONEHALF]
paddd xmm7,[rel PD_ONEHALF] paddd xmm7, [rel PD_ONEHALF]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm7,SCALEBITS psrad xmm7, SCALEBITS
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
mov al,2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
.Yloop_2nd: .Yloop_2nd:
@@ -172,29 +172,29 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
.Yloop_1st: .Yloop_1st:
movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -203,44 +203,44 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -276,7 +276,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub rcx, byte 2*SIZEOF_XMMWORD sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
@@ -284,7 +284,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
@@ -324,35 +324,35 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -389,15 +389,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub rcx, byte SIZEOF_XMMWORD/2 sub rcx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4 cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
@@ -423,7 +423,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
@@ -449,8 +449,8 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
EXTN(jsimd_h2v2_merged_upsample_sse2): EXTN(jsimd_h2v2_merged_upsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
@@ -467,7 +467,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
push rdx ; inptr2 push rdx ; inptr2
push rbx ; inptr1 push rbx ; inptr1
push rsi ; inptr00 push rsi ; inptr00
mov rbx,rsp mov rbx, rsp
push rdi push rdi
push rcx push rcx
@@ -500,7 +500,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
push rdx ; inptr2 push rdx ; inptr2
push rbx ; inptr1 push rbx ; inptr1
push rsi ; inptr00 push rsi ; inptr00
mov rbx,rsp mov rbx, rsp
push rdi push rdi
push rcx push rcx

View File

@@ -44,11 +44,11 @@
EXTN(jsimd_h2v1_merged_upsample_sse2): EXTN(jsimd_h2v1_merged_upsample_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
@@ -61,7 +61,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx,ecx test ecx, ecx
jz near .return jz near .return
push ecx push ecx
@@ -79,28 +79,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pop ecx ; col pop ecx ; col
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax) movpic eax, POINTER [gotptr] ; load GOT address (eax)
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
pxor xmm1,xmm1 ; xmm1=(all 0's) pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3,xmm3 pcmpeqw xmm3, xmm3
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0,xmm7 movdqa xmm0, xmm7
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
paddw xmm6,xmm3 paddw xmm6, xmm3
paddw xmm4,xmm3 paddw xmm4, xmm3
paddw xmm7,xmm3 paddw xmm7, xmm3
paddw xmm0,xmm3 paddw xmm0, xmm3
; (Original) ; (Original)
; R = Y + 1.40200 * Cr ; R = Y + 1.40200 * Cr
@@ -112,102 +112,102 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb ; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm5,xmm6 ; xmm5=CbH movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2,xmm4 ; xmm2=CbL movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6,xmm6 ; xmm6=2*CbH paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4,xmm4 ; xmm4=2*CbL paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1,xmm7 ; xmm1=CrH movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3,xmm0 ; xmm3=CrL movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7,xmm7 ; xmm7=2*CrH paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0,xmm0 ; xmm0=2*CrL paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
paddw xmm6,[GOTOFF(eax,PW_ONE)] paddw xmm6, [GOTOFF(eax,PW_ONE)]
paddw xmm4,[GOTOFF(eax,PW_ONE)] paddw xmm4, [GOTOFF(eax,PW_ONE)]
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7,[GOTOFF(eax,PW_ONE)] paddw xmm7, [GOTOFF(eax,PW_ONE)]
paddw xmm0,[GOTOFF(eax,PW_ONE)] paddw xmm0, [GOTOFF(eax,PW_ONE)]
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6,xmm5 paddw xmm6, xmm5
paddw xmm4,xmm2 paddw xmm4, xmm2
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm5,xmm1 punpcklwd xmm5, xmm1
punpckhwd xmm6,xmm1 punpckhwd xmm6, xmm1
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm2,xmm3 punpcklwd xmm2, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm5,SCALEBITS psrad xmm5, SCALEBITS
psrad xmm6,SCALEBITS psrad xmm6, SCALEBITS
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2,SCALEBITS psrad xmm2, SCALEBITS
psrad xmm7,SCALEBITS psrad xmm7, SCALEBITS
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
mov al,2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
alignx 16,7 alignx 16, 7
.Yloop_2nd: .Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
alignx 16,7 alignx 16, 7
.Yloop_1st: .Yloop_1st:
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
pcmpeqw xmm6,xmm6 pcmpeqw xmm6, xmm6
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -216,44 +216,44 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG,xmmA movdqa xmmG, xmmA
movdqa xmmH,xmmA movdqa xmmH, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC,xmmD movdqa xmmC, xmmD
movdqa xmmB,xmmD movdqa xmmB, xmmD
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF,xmmE movdqa xmmF, xmmE
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB,xmmE movdqa xmmB, xmmE
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB,xmmF movdqa xmmB, xmmF
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -281,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -290,7 +290,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
@@ -298,7 +298,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
.column_st15: .column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough ; Store the lower 8 bytes of xmmA to the output when it has enough
@@ -338,35 +338,35 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF %ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else %else
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif %endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC,xmmA movdqa xmmC, xmmA
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG,xmmB movdqa xmmG, xmmB
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD,xmmA movdqa xmmD, xmmA
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH,xmmC movdqa xmmH, xmmC
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32 jb short .column_st32
@@ -396,7 +396,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16,7 alignx 16, 7
.column_st32: .column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
@@ -404,15 +404,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA, xmmC
movdqa xmmD,xmmH movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2 sub ecx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4 cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
.column_st15: .column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough ; Store two pixels (8 bytes) of xmmA to the output when it has enough
@@ -441,7 +441,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
@@ -467,7 +467,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
EXTN(jsimd_h2v2_merged_upsample_sse2): EXTN(jsimd_h2v2_merged_upsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -487,7 +487,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
push edx ; inptr2 push edx ; inptr2
push ebx ; inptr1 push ebx ; inptr1
push esi ; inptr00 push esi ; inptr00
mov ebx,esp mov ebx, esp
push edi ; output_buf (outptr0) push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr push ecx ; in_row_group_ctr

View File

@@ -62,16 +62,16 @@ PW_EIGHT times 8 dw 8
EXTN(jsimd_h2v1_fancy_upsample_sse2): EXTN(jsimd_h2v1_fancy_upsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov eax, r11d ; colctr mov eax, r11d ; colctr
test rax,rax test rax, rax
jz near .return jz near .return
mov rcx, r10 ; rowctr mov rcx, r10 ; rowctr
test rcx,rcx test rcx, rcx
jz near .return jz near .return
mov rsi, r12 ; input_data mov rsi, r12 ; input_data
@@ -90,9 +90,9 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip: .skip:
pxor xmm0,xmm0 ; xmm0=(all 0's) pxor xmm0, xmm0 ; xmm0=(all 0's)
pcmpeqb xmm7,xmm7 pcmpeqb xmm7, xmm7
psrldq xmm7,(SIZEOF_XMMWORD-1) psrldq xmm7, (SIZEOF_XMMWORD-1)
pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
add rax, byte SIZEOF_XMMWORD-1 add rax, byte SIZEOF_XMMWORD-1
@@ -101,58 +101,58 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
ja short .columnloop ja short .columnloop
.columnloop_last: .columnloop_last:
pcmpeqb xmm6,xmm6 pcmpeqb xmm6, xmm6
pslldq xmm6,(SIZEOF_XMMWORD-1) pslldq xmm6, (SIZEOF_XMMWORD-1)
pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
jmp short .upsample jmp short .upsample
.columnloop: .columnloop:
movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
pslldq xmm6,(SIZEOF_XMMWORD-1) pslldq xmm6, (SIZEOF_XMMWORD-1)
.upsample: .upsample:
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
movdqa xmm7,xmm1 movdqa xmm7, xmm1
psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
movdqa xmm4,xmm1 movdqa xmm4, xmm1
punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
movdqa xmm5,xmm2 movdqa xmm5, xmm2
punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
movdqa xmm6,xmm3 movdqa xmm6, xmm3
punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
pmullw xmm1,[rel PW_THREE] pmullw xmm1, [rel PW_THREE]
pmullw xmm4,[rel PW_THREE] pmullw xmm4, [rel PW_THREE]
paddw xmm2,[rel PW_ONE] paddw xmm2, [rel PW_ONE]
paddw xmm5,[rel PW_ONE] paddw xmm5, [rel PW_ONE]
paddw xmm3,[rel PW_TWO] paddw xmm3, [rel PW_TWO]
paddw xmm6,[rel PW_TWO] paddw xmm6, [rel PW_TWO]
paddw xmm2,xmm1 paddw xmm2, xmm1
paddw xmm5,xmm4 paddw xmm5, xmm4
psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
paddw xmm3,xmm1 paddw xmm3, xmm1
paddw xmm6,xmm4 paddw xmm6, xmm4
psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
psllw xmm3,BYTE_BIT psllw xmm3, BYTE_BIT
psllw xmm6,BYTE_BIT psllw xmm6, BYTE_BIT
por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
@@ -162,7 +162,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
add rdi, byte 2*SIZEOF_XMMWORD ; outptr add rdi, byte 2*SIZEOF_XMMWORD ; outptr
cmp rax, byte SIZEOF_XMMWORD cmp rax, byte SIZEOF_XMMWORD
ja near .columnloop ja near .columnloop
test eax,eax test eax, eax
jnz near .columnloop_last jnz near .columnloop_last
pop rsi pop rsi
@@ -204,21 +204,21 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
EXTN(jsimd_h2v2_fancy_upsample_sse2): EXTN(jsimd_h2v2_fancy_upsample_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
push rbx push rbx
mov eax, r11d ; colctr mov eax, r11d ; colctr
test rax,rax test rax, rax
jz near .return jz near .return
mov rcx, r10 ; rowctr mov rcx, r10 ; rowctr
test rcx,rcx test rcx, rcx
jz near .return jz near .return
mov rsi, r12 ; input_data mov rsi, r12 ; input_data
@@ -253,35 +253,35 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
pxor xmm3,xmm3 ; xmm3=(all 0's) pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
pmullw xmm0,[rel PW_THREE] pmullw xmm0, [rel PW_THREE]
pmullw xmm4,[rel PW_THREE] pmullw xmm4, [rel PW_THREE]
pcmpeqb xmm7,xmm7 pcmpeqb xmm7, xmm7
psrldq xmm7,(SIZEOF_XMMWORD-2) psrldq xmm7, (SIZEOF_XMMWORD-2)
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
movdqa XMMWORD [wk(0)], xmm1 movdqa XMMWORD [wk(0)], xmm1
movdqa XMMWORD [wk(1)], xmm2 movdqa XMMWORD [wk(1)], xmm2
@@ -294,9 +294,9 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
.columnloop_last: .columnloop_last:
; -- process the last column block ; -- process the last column block
pcmpeqb xmm1,xmm1 pcmpeqb xmm1, xmm1
pslldq xmm1,(SIZEOF_XMMWORD-2) pslldq xmm1, (SIZEOF_XMMWORD-2)
movdqa xmm2,xmm1 movdqa xmm2, xmm1
pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
@@ -313,32 +313,32 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
pxor xmm3,xmm3 ; xmm3=(all 0's) pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
pmullw xmm0,[rel PW_THREE] pmullw xmm0, [rel PW_THREE]
pmullw xmm4,[rel PW_THREE] pmullw xmm4, [rel PW_THREE]
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
movdqa XMMWORD [wk(2)], xmm1 movdqa XMMWORD [wk(2)], xmm1
movdqa XMMWORD [wk(3)], xmm2 movdqa XMMWORD [wk(3)], xmm2
@@ -349,50 +349,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
movdqa xmm5,xmm7 movdqa xmm5, xmm7
movdqa xmm6,xmm3 movdqa xmm6, xmm3
psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
movdqa xmm1,xmm7 movdqa xmm1, xmm7
movdqa xmm2,xmm3 movdqa xmm2, xmm3
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
movdqa xmm4,xmm3 movdqa xmm4, xmm3
psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
movdqa XMMWORD [wk(0)], xmm4 movdqa XMMWORD [wk(0)], xmm4
pmullw xmm7,[rel PW_THREE] pmullw xmm7, [rel PW_THREE]
pmullw xmm3,[rel PW_THREE] pmullw xmm3, [rel PW_THREE]
paddw xmm1,[rel PW_EIGHT] paddw xmm1, [rel PW_EIGHT]
paddw xmm5,[rel PW_EIGHT] paddw xmm5, [rel PW_EIGHT]
paddw xmm0,[rel PW_SEVEN] paddw xmm0, [rel PW_SEVEN]
paddw xmm2,[rel PW_SEVEN] paddw xmm2, [rel PW_SEVEN]
paddw xmm1,xmm7 paddw xmm1, xmm7
paddw xmm5,xmm3 paddw xmm5, xmm3
psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm2,xmm3 paddw xmm2, xmm3
psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
psllw xmm2,BYTE_BIT psllw xmm2, BYTE_BIT
por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
@@ -402,50 +402,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
movdqa xmm0,xmm6 movdqa xmm0, xmm6
movdqa xmm2,xmm4 movdqa xmm2, xmm4
psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
movdqa xmm3,xmm4 movdqa xmm3, xmm4
psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
movdqa XMMWORD [wk(1)], xmm3 movdqa XMMWORD [wk(1)], xmm3
pmullw xmm6,[rel PW_THREE] pmullw xmm6, [rel PW_THREE]
pmullw xmm4,[rel PW_THREE] pmullw xmm4, [rel PW_THREE]
paddw xmm1,[rel PW_EIGHT] paddw xmm1, [rel PW_EIGHT]
paddw xmm0,[rel PW_EIGHT] paddw xmm0, [rel PW_EIGHT]
paddw xmm7,[rel PW_SEVEN] paddw xmm7, [rel PW_SEVEN]
paddw xmm5,[rel PW_SEVEN] paddw xmm5, [rel PW_SEVEN]
paddw xmm1,xmm6 paddw xmm1, xmm6
paddw xmm0,xmm4 paddw xmm0, xmm4
psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
paddw xmm7,xmm6 paddw xmm7, xmm6
paddw xmm5,xmm4 paddw xmm5, xmm4
psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
psllw xmm5,BYTE_BIT psllw xmm5, BYTE_BIT
por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
@@ -458,7 +458,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
cmp rax, byte SIZEOF_XMMWORD cmp rax, byte SIZEOF_XMMWORD
ja near .columnloop ja near .columnloop
test rax,rax test rax, rax
jnz near .columnloop_last jnz near .columnloop_last
pop rsi pop rsi
@@ -474,7 +474,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
@@ -501,8 +501,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
EXTN(jsimd_h2v1_upsample_sse2): EXTN(jsimd_h2v1_upsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov edx, r11d mov edx, r11d
@@ -511,7 +511,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
jz near .return jz near .return
mov rcx, r10 ; rowctr mov rcx, r10 ; rowctr
test rcx,rcx test rcx, rcx
jz short .return jz short .return
mov rsi, r12 ; input_data mov rsi, r12 ; input_data
@@ -523,14 +523,14 @@ EXTN(jsimd_h2v1_upsample_sse2):
mov rsi, JSAMPROW [rsi] ; inptr mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr mov rdi, JSAMPROW [rdi] ; outptr
mov rax,rdx ; colctr mov rax, rdx ; colctr
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm1,xmm0 movdqa xmm1, xmm0
punpcklbw xmm0,xmm0 punpcklbw xmm0, xmm0
punpckhbw xmm1,xmm1 punpckhbw xmm1, xmm1
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
@@ -540,9 +540,9 @@ EXTN(jsimd_h2v1_upsample_sse2):
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqa xmm3,xmm2 movdqa xmm3, xmm2
punpcklbw xmm2,xmm2 punpcklbw xmm2, xmm2
punpckhbw xmm3,xmm3 punpckhbw xmm3, xmm3
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
@@ -574,7 +574,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
; It's still a box filter. ; It's still a box filter.
; ;
; GLOBAL(void) ; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor, ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
; JDIMENSION output_width, ; JDIMENSION output_width,
; JSAMPARRAY input_data, ; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr); ; JSAMPARRAY *output_data_ptr);
@@ -590,8 +590,8 @@ EXTN(jsimd_h2v1_upsample_sse2):
EXTN(jsimd_h2v2_upsample_sse2): EXTN(jsimd_h2v2_upsample_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
@@ -601,7 +601,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
jz near .return jz near .return
mov rcx, r10 ; rowctr mov rcx, r10 ; rowctr
test rcx,rcx test rcx, rcx
jz near .return jz near .return
mov rsi, r12 ; input_data mov rsi, r12 ; input_data
@@ -614,14 +614,14 @@ EXTN(jsimd_h2v2_upsample_sse2):
mov rsi, JSAMPROW [rsi] ; inptr mov rsi, JSAMPROW [rsi] ; inptr
mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
mov rax,rdx ; colctr mov rax, rdx ; colctr
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqa xmm1,xmm0 movdqa xmm1, xmm0
punpcklbw xmm0,xmm0 punpcklbw xmm0, xmm0
punpckhbw xmm1,xmm1 punpckhbw xmm1, xmm1
movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
@@ -633,9 +633,9 @@ EXTN(jsimd_h2v2_upsample_sse2):
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqa xmm3,xmm2 movdqa xmm3, xmm2
punpcklbw xmm2,xmm2 punpcklbw xmm2, xmm2
punpckhbw xmm3,xmm3 punpckhbw xmm3, xmm3
movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3

View File

@@ -61,7 +61,7 @@ PW_EIGHT times 8 dw 8
EXTN(jsimd_h2v1_fancy_upsample_sse2): EXTN(jsimd_h2v1_fancy_upsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
pushpic ebx pushpic ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -71,17 +71,17 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax,eax test eax, eax
jz near .return jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx,ecx test ecx, ecx
jz near .return jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push edi push edi
@@ -95,71 +95,71 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip: .skip:
pxor xmm0,xmm0 ; xmm0=(all 0's) pxor xmm0, xmm0 ; xmm0=(all 0's)
pcmpeqb xmm7,xmm7 pcmpeqb xmm7, xmm7
psrldq xmm7,(SIZEOF_XMMWORD-1) psrldq xmm7, (SIZEOF_XMMWORD-1)
pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
add eax, byte SIZEOF_XMMWORD-1 add eax, byte SIZEOF_XMMWORD-1
and eax, byte -SIZEOF_XMMWORD and eax, byte -SIZEOF_XMMWORD
cmp eax, byte SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD
ja short .columnloop ja short .columnloop
alignx 16,7 alignx 16, 7
.columnloop_last: .columnloop_last:
pcmpeqb xmm6,xmm6 pcmpeqb xmm6, xmm6
pslldq xmm6,(SIZEOF_XMMWORD-1) pslldq xmm6, (SIZEOF_XMMWORD-1)
pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
jmp short .upsample jmp short .upsample
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
pslldq xmm6,(SIZEOF_XMMWORD-1) pslldq xmm6, (SIZEOF_XMMWORD-1)
.upsample: .upsample:
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
movdqa xmm7,xmm1 movdqa xmm7, xmm1
psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
movdqa xmm4,xmm1 movdqa xmm4, xmm1
punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
movdqa xmm5,xmm2 movdqa xmm5, xmm2
punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
movdqa xmm6,xmm3 movdqa xmm6, xmm3
punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
pmullw xmm1,[GOTOFF(ebx,PW_THREE)] pmullw xmm1, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4,[GOTOFF(ebx,PW_THREE)] pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
paddw xmm2,[GOTOFF(ebx,PW_ONE)] paddw xmm2, [GOTOFF(ebx,PW_ONE)]
paddw xmm5,[GOTOFF(ebx,PW_ONE)] paddw xmm5, [GOTOFF(ebx,PW_ONE)]
paddw xmm3,[GOTOFF(ebx,PW_TWO)] paddw xmm3, [GOTOFF(ebx,PW_TWO)]
paddw xmm6,[GOTOFF(ebx,PW_TWO)] paddw xmm6, [GOTOFF(ebx,PW_TWO)]
paddw xmm2,xmm1 paddw xmm2, xmm1
paddw xmm5,xmm4 paddw xmm5, xmm4
psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
paddw xmm3,xmm1 paddw xmm3, xmm1
paddw xmm6,xmm4 paddw xmm6, xmm4
psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
psllw xmm3,BYTE_BIT psllw xmm3, BYTE_BIT
psllw xmm6,BYTE_BIT psllw xmm6, BYTE_BIT
por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
@@ -169,7 +169,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
add edi, byte 2*SIZEOF_XMMWORD ; outptr add edi, byte 2*SIZEOF_XMMWORD ; outptr
cmp eax, byte SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD
ja near .columnloop ja near .columnloop
test eax,eax test eax, eax
jnz near .columnloop_last jnz near .columnloop_last
pop esi pop esi
@@ -217,11 +217,11 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
EXTN(jsimd_h2v2_fancy_upsample_sse2): EXTN(jsimd_h2v2_fancy_upsample_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address pushpic eax ; make a room for GOT address
push ebx push ebx
@@ -233,19 +233,19 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
get_GOT ebx ; get GOT address get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address movpic POINTER [gotptr], ebx ; save GOT address
mov edx,eax ; edx = original ebp mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
test eax,eax test eax, eax
jz near .return jz near .return
mov ecx, INT [max_v_samp(edx)] ; rowctr mov ecx, INT [max_v_samp(edx)] ; rowctr
test ecx,ecx test ecx, ecx
jz near .return jz near .return
mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)] mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push ecx push ecx
@@ -278,35 +278,35 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
pushpic ebx pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address movpic ebx, POINTER [gotptr] ; load GOT address
pxor xmm3,xmm3 ; xmm3=(all 0's) pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
pmullw xmm0,[GOTOFF(ebx,PW_THREE)] pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4,[GOTOFF(ebx,PW_THREE)] pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
pcmpeqb xmm7,xmm7 pcmpeqb xmm7, xmm7
psrldq xmm7,(SIZEOF_XMMWORD-2) psrldq xmm7, (SIZEOF_XMMWORD-2)
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
movdqa XMMWORD [wk(0)], xmm1 movdqa XMMWORD [wk(0)], xmm1
movdqa XMMWORD [wk(1)], xmm2 movdqa XMMWORD [wk(1)], xmm2
@@ -317,7 +317,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
and eax, byte -SIZEOF_XMMWORD and eax, byte -SIZEOF_XMMWORD
cmp eax, byte SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD
ja short .columnloop ja short .columnloop
alignx 16,7 alignx 16, 7
.columnloop_last: .columnloop_last:
; -- process the last column block ; -- process the last column block
@@ -325,9 +325,9 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
pushpic ebx pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address movpic ebx, POINTER [gotptr] ; load GOT address
pcmpeqb xmm1,xmm1 pcmpeqb xmm1, xmm1
pslldq xmm1,(SIZEOF_XMMWORD-2) pslldq xmm1, (SIZEOF_XMMWORD-2)
movdqa xmm2,xmm1 movdqa xmm2, xmm1
pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
@@ -336,7 +336,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
jmp near .upsample jmp near .upsample
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
; -- process the next column block ; -- process the next column block
@@ -348,32 +348,32 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
pushpic ebx pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address movpic ebx, POINTER [gotptr] ; load GOT address
pxor xmm3,xmm3 ; xmm3=(all 0's) pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4,xmm0 movdqa xmm4, xmm0
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
pmullw xmm0,[GOTOFF(ebx,PW_THREE)] pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4,[GOTOFF(ebx,PW_THREE)] pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
movdqa XMMWORD [wk(2)], xmm1 movdqa XMMWORD [wk(2)], xmm1
movdqa XMMWORD [wk(3)], xmm2 movdqa XMMWORD [wk(3)], xmm2
@@ -384,50 +384,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
movdqa xmm5,xmm7 movdqa xmm5, xmm7
movdqa xmm6,xmm3 movdqa xmm6, xmm3
psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
movdqa xmm1,xmm7 movdqa xmm1, xmm7
movdqa xmm2,xmm3 movdqa xmm2, xmm3
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
movdqa xmm4,xmm3 movdqa xmm4, xmm3
psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
movdqa XMMWORD [wk(0)], xmm4 movdqa XMMWORD [wk(0)], xmm4
pmullw xmm7,[GOTOFF(ebx,PW_THREE)] pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
pmullw xmm3,[GOTOFF(ebx,PW_THREE)] pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm1,xmm7 paddw xmm1, xmm7
paddw xmm5,xmm3 paddw xmm5, xmm3
psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm2,xmm3 paddw xmm2, xmm3
psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
psllw xmm0,BYTE_BIT psllw xmm0, BYTE_BIT
psllw xmm2,BYTE_BIT psllw xmm2, BYTE_BIT
por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
@@ -437,50 +437,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
movdqa xmm0,xmm6 movdqa xmm0, xmm6
movdqa xmm2,xmm4 movdqa xmm2, xmm4
psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm4 movdqa xmm5, xmm4
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
movdqa xmm3,xmm4 movdqa xmm3, xmm4
psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
movdqa XMMWORD [wk(1)], xmm3 movdqa XMMWORD [wk(1)], xmm3
pmullw xmm6,[GOTOFF(ebx,PW_THREE)] pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4,[GOTOFF(ebx,PW_THREE)] pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm1,xmm6 paddw xmm1, xmm6
paddw xmm0,xmm4 paddw xmm0, xmm4
psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
paddw xmm7,xmm6 paddw xmm7, xmm6
paddw xmm5,xmm4 paddw xmm5, xmm4
psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
psllw xmm7,BYTE_BIT psllw xmm7, BYTE_BIT
psllw xmm5,BYTE_BIT psllw xmm5, BYTE_BIT
por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
@@ -495,7 +495,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
add edi, byte 2*SIZEOF_XMMWORD ; outptr1 add edi, byte 2*SIZEOF_XMMWORD ; outptr1
cmp eax, byte SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD
ja near .columnloop ja near .columnloop
test eax,eax test eax, eax
jnz near .columnloop_last jnz near .columnloop_last
pop esi pop esi
@@ -514,7 +514,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
@@ -541,7 +541,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
EXTN(jsimd_h2v1_upsample_sse2): EXTN(jsimd_h2v1_upsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -554,28 +554,28 @@ EXTN(jsimd_h2v1_upsample_sse2):
jz short .return jz short .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx,ecx test ecx, ecx
jz short .return jz short .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
mov eax,edx ; colctr mov eax, edx ; colctr
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1,xmm0 movdqa xmm1, xmm0
punpcklbw xmm0,xmm0 punpcklbw xmm0, xmm0
punpckhbw xmm1,xmm1 punpckhbw xmm1, xmm1
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
@@ -585,9 +585,9 @@ EXTN(jsimd_h2v1_upsample_sse2):
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqa xmm3,xmm2 movdqa xmm3, xmm2
punpcklbw xmm2,xmm2 punpcklbw xmm2, xmm2
punpckhbw xmm3,xmm3 punpckhbw xmm3, xmm3
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
@@ -598,7 +598,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
add esi, byte 2*SIZEOF_XMMWORD ; inptr add esi, byte 2*SIZEOF_XMMWORD ; inptr
add edi, byte 4*SIZEOF_XMMWORD ; outptr add edi, byte 4*SIZEOF_XMMWORD ; outptr
jmp short .columnloop jmp short .columnloop
alignx 16,7 alignx 16, 7
.nextrow: .nextrow:
pop esi pop esi
@@ -640,7 +640,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
EXTN(jsimd_h2v2_upsample_sse2): EXTN(jsimd_h2v2_upsample_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -653,13 +653,13 @@ EXTN(jsimd_h2v2_upsample_sse2):
jz near .return jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx,ecx test ecx, ecx
jz near .return jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -667,15 +667,15 @@ EXTN(jsimd_h2v2_upsample_sse2):
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax,edx ; colctr mov eax, edx ; colctr
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1,xmm0 movdqa xmm1, xmm0
punpcklbw xmm0,xmm0 punpcklbw xmm0, xmm0
punpckhbw xmm1,xmm1 punpckhbw xmm1, xmm1
movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
@@ -687,9 +687,9 @@ EXTN(jsimd_h2v2_upsample_sse2):
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqa xmm3,xmm2 movdqa xmm3, xmm2
punpcklbw xmm2,xmm2 punpcklbw xmm2, xmm2
punpckhbw xmm3,xmm3 punpckhbw xmm3, xmm3
movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
@@ -703,7 +703,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
add edi, byte 4*SIZEOF_XMMWORD ; outptr1 add edi, byte 4*SIZEOF_XMMWORD ; outptr1
jmp short .columnloop jmp short .columnloop
alignx 16,7 alignx 16, 7
.nextrow: .nextrow:
pop esi pop esi

View File

@@ -26,11 +26,11 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -68,11 +68,11 @@ PD_1_306 times 4 dd 1.306562964876376527856643
EXTN(jsimd_fdct_float_sse): EXTN(jsimd_fdct_float_sse):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
@@ -90,12 +90,12 @@ EXTN(jsimd_fdct_float_sse):
; xmm0=(20 21 22 23), xmm2=(24 25 26 27) ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37) ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
@@ -108,64 +108,64 @@ EXTN(jsimd_fdct_float_sse):
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[rel PD_0_707] ; xmm0=z1 mulps xmm0, [rel PD_0_707] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
@@ -177,30 +177,30 @@ EXTN(jsimd_fdct_float_sse):
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[rel PD_0_707] ; xmm3=z3 mulps xmm3, [rel PD_0_707] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[rel PD_0_382] ; xmm2=z5 mulps xmm2, [rel PD_0_382] ; xmm2=z5
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
@@ -225,12 +225,12 @@ EXTN(jsimd_fdct_float_sse):
; xmm0=(02 12 22 32), xmm2=(42 52 62 72) ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73) ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
@@ -243,64 +243,64 @@ EXTN(jsimd_fdct_float_sse):
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[rel PD_0_707] ; xmm0=z1 mulps xmm0, [rel PD_0_707] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
@@ -312,30 +312,30 @@ EXTN(jsimd_fdct_float_sse):
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[rel PD_0_707] ; xmm3=z3 mulps xmm3, [rel PD_0_707] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[rel PD_0_382] ; xmm2=z5 mulps xmm2, [rel PD_0_382] ; xmm2=z5
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
@@ -347,7 +347,7 @@ EXTN(jsimd_fdct_float_sse):
jnz near .columnloop jnz near .columnloop
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -25,11 +25,11 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
; unpcklps2 dst, src
; Packs the two low single-precision elements of %1 and the two low
; elements of %2 into %1. The shufps selector 0x44 = 01 00 01 00b picks
; elements 0,1 of the first operand for result lanes 0-1 and elements 0,1
; of the second operand for result lanes 2-3 (the notation on the macro
; line numbers the second operand's elements 4..7).
; %1 is overwritten; %2 is only read.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
; unpckhps2 dst, src
; Packs the two high single-precision elements of %1 and the two high
; elements of %2 into %1. The shufps selector 0xEE = 11 10 11 10b picks
; elements 2,3 of the first operand for result lanes 0-1 and elements 2,3
; of the second operand for result lanes 2-3 (the notation on the macro
; line numbers the second operand's elements 4..7).
; %1 is overwritten; %2 is only read.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -68,11 +68,11 @@ PD_1_306 times 4 dd 1.306562964876376527856643
EXTN(jsimd_fdct_float_sse): EXTN(jsimd_fdct_float_sse):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
@@ -86,7 +86,7 @@ EXTN(jsimd_fdct_float_sse):
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
@@ -97,12 +97,12 @@ EXTN(jsimd_fdct_float_sse):
; xmm0=(20 21 22 23), xmm2=(24 25 26 27) ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37) ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
@@ -115,64 +115,64 @@ EXTN(jsimd_fdct_float_sse):
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
@@ -184,30 +184,30 @@ EXTN(jsimd_fdct_float_sse):
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
@@ -222,7 +222,7 @@ EXTN(jsimd_fdct_float_sse):
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
@@ -233,12 +233,12 @@ EXTN(jsimd_fdct_float_sse):
; xmm0=(02 12 22 32), xmm2=(42 52 62 72) ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73) ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
movaps xmm4,xmm0 ; transpose coefficients(phase 1) movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5,xmm2 ; transpose coefficients(phase 1) movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
@@ -251,64 +251,64 @@ EXTN(jsimd_fdct_float_sse):
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4,xmm6 ; transpose coefficients(phase 1) movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2,xmm1 ; transpose coefficients(phase 1) movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
movaps xmm7,xmm6 ; transpose coefficients(phase 2) movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3,xmm2 ; transpose coefficients(phase 2) movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm5,xmm6 movaps xmm5, xmm6
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7,xmm4 ; transpose coefficients(phase 2) movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6,xmm1 ; transpose coefficients(phase 2) movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm2,xmm7 movaps xmm2, xmm7
movaps xmm3,xmm4 movaps xmm3, xmm4
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movaps xmm1,xmm5 movaps xmm1, xmm5
movaps xmm6,xmm0 movaps xmm6, xmm0
subps xmm5,xmm7 ; xmm5=tmp13 subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0,xmm4 ; xmm0=tmp12 subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1,xmm7 ; xmm1=tmp10 addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6,xmm4 ; xmm6=tmp11 addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0,xmm5 addps xmm0, xmm5
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm1,xmm6 ; xmm1=data4 subps xmm1, xmm6 ; xmm1=data4
subps xmm5,xmm0 ; xmm5=data6 subps xmm5, xmm0 ; xmm5=data6
addps xmm7,xmm6 ; xmm7=data0 addps xmm7, xmm6 ; xmm7=data0
addps xmm4,xmm0 ; xmm4=data2 addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
@@ -320,30 +320,30 @@ EXTN(jsimd_fdct_float_sse):
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2,xmm3 ; xmm2=tmp10 addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3,xmm6 ; xmm3=tmp11 addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1,xmm2 ; xmm1=tmp10 movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2,xmm6 subps xmm2, xmm6
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1,xmm2 ; xmm1=z2 addps xmm1, xmm2 ; xmm1=z2
addps xmm6,xmm2 ; xmm6=z4 addps xmm6, xmm2 ; xmm6=z4
movaps xmm5,xmm0 movaps xmm5, xmm0
subps xmm0,xmm3 ; xmm0=z13 subps xmm0, xmm3 ; xmm0=z13
addps xmm5,xmm3 ; xmm5=z11 addps xmm5, xmm3 ; xmm5=z11
movaps xmm7,xmm0 movaps xmm7, xmm0
movaps xmm4,xmm5 movaps xmm4, xmm5
subps xmm0,xmm1 ; xmm0=data3 subps xmm0, xmm1 ; xmm0=data3
subps xmm5,xmm6 ; xmm5=data7 subps xmm5, xmm6 ; xmm5=data7
addps xmm7,xmm1 ; xmm7=data5 addps xmm7, xmm1 ; xmm7=data5
addps xmm4,xmm6 ; xmm4=data1 addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
@@ -359,7 +359,7 @@ EXTN(jsimd_fdct_float_sse):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -36,10 +36,10 @@ F_1_306 equ 334 ; FIX(1.306562965)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x,n): integer right shift of x by n bits with round-to-nearest
; (adds 1<<(n-1), i.e. half of the discarded range, before shifting).
; The integer literals below equal the FIX() values named in the trailing
; comments multiplied by 2^30 (e.g. 759250124 / 2^30 ~= 0.707106781), so
; descaling by (30-CONST_BITS) leaves each constant with CONST_BITS
; fractional bits.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -83,11 +83,11 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
EXTN(jsimd_fdct_ifast_sse2): EXTN(jsimd_fdct_ifast_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
@@ -103,12 +103,12 @@ EXTN(jsimd_fdct_ifast_sse2):
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1) movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1) movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
@@ -121,84 +121,84 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1) movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2) movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2) movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2) movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3) movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3) movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6,xmm1 movdqa xmm6, xmm1
movdqa xmm3,xmm0 movdqa xmm3, xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3) movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm0,xmm6 movdqa xmm0, xmm6
psubw xmm3,xmm1 ; xmm3=tmp13 psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6,xmm7 ; xmm6=tmp12 psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4,xmm1 ; xmm4=tmp10 paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0,xmm7 ; xmm0=tmp11 paddw xmm0, xmm7 ; xmm0=tmp11
paddw xmm6,xmm3 paddw xmm6, xmm3
psllw xmm6,PRE_MULTIPLY_SCALE_BITS psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm7,xmm3 movdqa xmm7, xmm3
psubw xmm4,xmm0 ; xmm4=data4 psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3,xmm6 ; xmm3=data6 psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1,xmm0 ; xmm1=data0 paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7,xmm6 ; xmm7=data2 paddw xmm7, xmm6 ; xmm7=data2
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
@@ -207,46 +207,46 @@ EXTN(jsimd_fdct_ifast_sse2):
; -- Odd part ; -- Odd part
paddw xmm2,xmm5 ; xmm2=tmp10 paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5,xmm0 ; xmm5=tmp11 paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
movdqa xmm4,xmm2 ; xmm4=tmp10 movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2,xmm0 psubw xmm2, xmm0
pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm2 ; xmm4=z2 paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0,xmm2 ; xmm0=z4 paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm6,xmm5 ; xmm6=z13 psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3,xmm5 ; xmm3=z11 paddw xmm3, xmm5 ; xmm3=z11
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm5,xmm3 movdqa xmm5, xmm3
psubw xmm6,xmm4 ; xmm6=data3 psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3,xmm0 ; xmm3=data7 psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2,xmm4 ; xmm2=data5 paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5,xmm0 ; xmm5=data1 paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4,xmm1 ; transpose coefficients(phase 1) movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0,xmm7 ; transpose coefficients(phase 1) movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
@@ -257,84 +257,84 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7,xmm5 ; transpose coefficients(phase 1) movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0,xmm6 ; transpose coefficients(phase 1) movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2,xmm5 ; transpose coefficients(phase 2) movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3,xmm7 ; transpose coefficients(phase 2) movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2,xmm1 ; transpose coefficients(phase 2) movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7,xmm4 ; transpose coefficients(phase 2) movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6,xmm1 ; transpose coefficients(phase 3) movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0,xmm7 ; transpose coefficients(phase 3) movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm3,xmm1 movdqa xmm3, xmm1
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6,xmm2 ; transpose coefficients(phase 3) movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1,xmm4 ; transpose coefficients(phase 3) movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm1,xmm5 movdqa xmm1, xmm5
psubw xmm3,xmm6 ; xmm3=tmp13 psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5,xmm2 ; xmm5=tmp12 psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4,xmm6 ; xmm4=tmp10 paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1,xmm2 ; xmm1=tmp11 paddw xmm1, xmm2 ; xmm1=tmp11
paddw xmm5,xmm3 paddw xmm5, xmm3
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm4,xmm1 ; xmm4=data4 psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3,xmm5 ; xmm3=data6 psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6,xmm1 ; xmm6=data0 paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2,xmm5 ; xmm2=data2 paddw xmm2, xmm5 ; xmm2=data2
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
@@ -346,34 +346,34 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7,xmm0 ; xmm7=tmp10 paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0,xmm1 ; xmm0=tmp11 paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
psllw xmm7,PRE_MULTIPLY_SCALE_BITS psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
movdqa xmm4,xmm7 ; xmm4=tmp10 movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7,xmm1 psubw xmm7, xmm1
pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm7 ; xmm4=z2 paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1,xmm7 ; xmm1=z4 paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3,xmm5 movdqa xmm3, xmm5
psubw xmm5,xmm0 ; xmm5=z13 psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3,xmm0 ; xmm3=z11 paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm5,xmm4 ; xmm5=data3 psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3,xmm1 ; xmm3=data7 psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6,xmm4 ; xmm6=data5 paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2,xmm1 ; xmm2=data1 paddw xmm2, xmm1 ; xmm2=data1
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
@@ -381,7 +381,7 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -35,10 +35,10 @@ F_1_306 equ 334 ; FIX(1.306562965)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -83,11 +83,11 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
EXTN(jsimd_fdct_ifast_sse2): EXTN(jsimd_fdct_ifast_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
@@ -109,12 +109,12 @@ EXTN(jsimd_fdct_ifast_sse2):
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1) movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1) movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
@@ -127,84 +127,84 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1) movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2) movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2) movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2) movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3) movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3) movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6,xmm1 movdqa xmm6, xmm1
movdqa xmm3,xmm0 movdqa xmm3, xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3) movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm0,xmm6 movdqa xmm0, xmm6
psubw xmm3,xmm1 ; xmm3=tmp13 psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6,xmm7 ; xmm6=tmp12 psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4,xmm1 ; xmm4=tmp10 paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0,xmm7 ; xmm0=tmp11 paddw xmm0, xmm7 ; xmm0=tmp11
paddw xmm6,xmm3 paddw xmm6, xmm3
psllw xmm6,PRE_MULTIPLY_SCALE_BITS psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm7,xmm3 movdqa xmm7, xmm3
psubw xmm4,xmm0 ; xmm4=data4 psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3,xmm6 ; xmm3=data6 psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1,xmm0 ; xmm1=data0 paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7,xmm6 ; xmm7=data2 paddw xmm7, xmm6 ; xmm7=data2
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
@@ -213,34 +213,34 @@ EXTN(jsimd_fdct_ifast_sse2):
; -- Odd part ; -- Odd part
paddw xmm2,xmm5 ; xmm2=tmp10 paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5,xmm0 ; xmm5=tmp11 paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
movdqa xmm4,xmm2 ; xmm4=tmp10 movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2,xmm0 psubw xmm2, xmm0
pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm2 ; xmm4=z2 paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0,xmm2 ; xmm0=z4 paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm6,xmm5 ; xmm6=z13 psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3,xmm5 ; xmm3=z11 paddw xmm3, xmm5 ; xmm3=z11
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm5,xmm3 movdqa xmm5, xmm3
psubw xmm6,xmm4 ; xmm6=data3 psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3,xmm0 ; xmm3=data7 psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2,xmm4 ; xmm2=data5 paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5,xmm0 ; xmm5=data1 paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
@@ -249,12 +249,12 @@ EXTN(jsimd_fdct_ifast_sse2):
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4,xmm1 ; transpose coefficients(phase 1) movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0,xmm7 ; transpose coefficients(phase 1) movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
@@ -265,84 +265,84 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7,xmm5 ; transpose coefficients(phase 1) movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0,xmm6 ; transpose coefficients(phase 1) movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2,xmm5 ; transpose coefficients(phase 2) movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3,xmm7 ; transpose coefficients(phase 2) movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2,xmm1 ; transpose coefficients(phase 2) movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7,xmm4 ; transpose coefficients(phase 2) movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6,xmm1 ; transpose coefficients(phase 3) movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0,xmm7 ; transpose coefficients(phase 3) movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm3,xmm1 movdqa xmm3, xmm1
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6,xmm2 ; transpose coefficients(phase 3) movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1,xmm4 ; transpose coefficients(phase 3) movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm1,xmm5 movdqa xmm1, xmm5
psubw xmm3,xmm6 ; xmm3=tmp13 psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5,xmm2 ; xmm5=tmp12 psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4,xmm6 ; xmm4=tmp10 paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1,xmm2 ; xmm1=tmp11 paddw xmm1, xmm2 ; xmm1=tmp11
paddw xmm5,xmm3 paddw xmm5, xmm3
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm4,xmm1 ; xmm4=data4 psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3,xmm5 ; xmm3=data6 psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6,xmm1 ; xmm6=data0 paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2,xmm5 ; xmm2=data2 paddw xmm2, xmm5 ; xmm2=data2
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
@@ -354,34 +354,34 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7,xmm0 ; xmm7=tmp10 paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0,xmm1 ; xmm0=tmp11 paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
psllw xmm7,PRE_MULTIPLY_SCALE_BITS psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
movdqa xmm4,xmm7 ; xmm4=tmp10 movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7,xmm1 psubw xmm7, xmm1
pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4,xmm7 ; xmm4=z2 paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1,xmm7 ; xmm1=z4 paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3,xmm5 movdqa xmm3, xmm5
psubw xmm5,xmm0 ; xmm5=z13 psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3,xmm0 ; xmm3=z11 paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6,xmm5 movdqa xmm6, xmm5
movdqa xmm2,xmm3 movdqa xmm2, xmm3
psubw xmm5,xmm4 ; xmm5=data3 psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3,xmm1 ; xmm3=data7 psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6,xmm4 ; xmm6=data5 paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2,xmm1 ; xmm2=data1 paddw xmm2, xmm1 ; xmm2=data1
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3

View File

@@ -48,18 +48,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -104,11 +104,11 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
EXTN(jsimd_fdct_islow_sse2): EXTN(jsimd_fdct_islow_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
@@ -124,12 +124,12 @@ EXTN(jsimd_fdct_islow_sse2):
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1) movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1) movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
@@ -142,80 +142,80 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1) movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2) movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2) movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2) movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3) movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3) movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6,xmm1 movdqa xmm6, xmm1
movdqa xmm3,xmm0 movdqa xmm3, xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3) movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm0,xmm6 movdqa xmm0, xmm6
paddw xmm3,xmm1 ; xmm3=tmp10 paddw xmm3, xmm1 ; xmm3=tmp10
paddw xmm6,xmm7 ; xmm6=tmp11 paddw xmm6, xmm7 ; xmm6=tmp11
psubw xmm4,xmm1 ; xmm4=tmp13 psubw xmm4, xmm1 ; xmm4=tmp13
psubw xmm0,xmm7 ; xmm0=tmp12 psubw xmm0, xmm7 ; xmm0=tmp12
movdqa xmm1,xmm3 movdqa xmm1, xmm3
paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
psllw xmm3,PASS1_BITS ; xmm3=data0 psllw xmm3, PASS1_BITS ; xmm3=data0
psllw xmm1,PASS1_BITS ; xmm1=data4 psllw xmm1, PASS1_BITS ; xmm1=data4
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
@@ -229,28 +229,28 @@ EXTN(jsimd_fdct_islow_sse2):
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movdqa xmm7,xmm4 ; xmm4=tmp13 movdqa xmm7, xmm4 ; xmm4=tmp13
movdqa xmm6,xmm4 movdqa xmm6, xmm4
punpcklwd xmm7,xmm0 ; xmm0=tmp12 punpcklwd xmm7, xmm0 ; xmm0=tmp12
punpckhwd xmm6,xmm0 punpckhwd xmm6, xmm0
movdqa xmm4,xmm7 movdqa xmm4, xmm7
movdqa xmm0,xmm6 movdqa xmm0, xmm6
pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
paddd xmm7,[rel PD_DESCALE_P1] paddd xmm7, [rel PD_DESCALE_P1]
paddd xmm6,[rel PD_DESCALE_P1] paddd xmm6, [rel PD_DESCALE_P1]
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
psrad xmm6,DESCALE_P1 psrad xmm6, DESCALE_P1
paddd xmm4,[rel PD_DESCALE_P1] paddd xmm4, [rel PD_DESCALE_P1]
paddd xmm0,[rel PD_DESCALE_P1] paddd xmm0, [rel PD_DESCALE_P1]
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
packssdw xmm7,xmm6 ; xmm7=data2 packssdw xmm7, xmm6 ; xmm7=data2
packssdw xmm4,xmm0 ; xmm4=data6 packssdw xmm4, xmm0 ; xmm4=data6
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
@@ -260,10 +260,10 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
movdqa xmm6,xmm2 ; xmm2=tmp4 movdqa xmm6, xmm2 ; xmm2=tmp4
movdqa xmm0,xmm5 ; xmm5=tmp5 movdqa xmm0, xmm5 ; xmm5=tmp5
paddw xmm6,xmm3 ; xmm6=z3 paddw xmm6, xmm3 ; xmm6=z3
paddw xmm0,xmm1 ; xmm0=z4 paddw xmm0, xmm1 ; xmm0=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -274,16 +274,16 @@ EXTN(jsimd_fdct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm7,xmm6 movdqa xmm7, xmm6
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm7,xmm0 punpcklwd xmm7, xmm0
punpckhwd xmm4,xmm0 punpckhwd xmm4, xmm0
movdqa xmm6,xmm7 movdqa xmm6, xmm7
movdqa xmm0,xmm4 movdqa xmm0, xmm4
pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
@@ -304,61 +304,61 @@ EXTN(jsimd_fdct_islow_sse2):
; data7 = tmp4 + z3; data5 = tmp5 + z4; ; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4; ; data3 = tmp6 + z3; data1 = tmp7 + z4;
movdqa xmm7,xmm2 movdqa xmm7, xmm2
movdqa xmm4,xmm2 movdqa xmm4, xmm2
punpcklwd xmm7,xmm1 punpcklwd xmm7, xmm1
punpckhwd xmm4,xmm1 punpckhwd xmm4, xmm1
movdqa xmm2,xmm7 movdqa xmm2, xmm7
movdqa xmm1,xmm4 movdqa xmm1, xmm4
pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
paddd xmm2,xmm6 ; xmm2=data1L paddd xmm2, xmm6 ; xmm2=data1L
paddd xmm1,xmm0 ; xmm1=data1H paddd xmm1, xmm0 ; xmm1=data1H
paddd xmm7,[rel PD_DESCALE_P1] paddd xmm7, [rel PD_DESCALE_P1]
paddd xmm4,[rel PD_DESCALE_P1] paddd xmm4, [rel PD_DESCALE_P1]
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
paddd xmm2,[rel PD_DESCALE_P1] paddd xmm2, [rel PD_DESCALE_P1]
paddd xmm1,[rel PD_DESCALE_P1] paddd xmm1, [rel PD_DESCALE_P1]
psrad xmm2,DESCALE_P1 psrad xmm2, DESCALE_P1
psrad xmm1,DESCALE_P1 psrad xmm1, DESCALE_P1
packssdw xmm7,xmm4 ; xmm7=data7 packssdw xmm7, xmm4 ; xmm7=data7
packssdw xmm2,xmm1 ; xmm2=data1 packssdw xmm2, xmm1 ; xmm2=data1
movdqa xmm4,xmm5 movdqa xmm4, xmm5
movdqa xmm1,xmm5 movdqa xmm1, xmm5
punpcklwd xmm4,xmm3 punpcklwd xmm4, xmm3
punpckhwd xmm1,xmm3 punpckhwd xmm1, xmm3
movdqa xmm5,xmm4 movdqa xmm5, xmm4
movdqa xmm3,xmm1 movdqa xmm3, xmm1
pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
paddd xmm4,xmm6 ; xmm4=data5L paddd xmm4, xmm6 ; xmm4=data5L
paddd xmm1,xmm0 ; xmm1=data5H paddd xmm1, xmm0 ; xmm1=data5H
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
paddd xmm4,[rel PD_DESCALE_P1] paddd xmm4, [rel PD_DESCALE_P1]
paddd xmm1,[rel PD_DESCALE_P1] paddd xmm1, [rel PD_DESCALE_P1]
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
psrad xmm1,DESCALE_P1 psrad xmm1, DESCALE_P1
paddd xmm5,[rel PD_DESCALE_P1] paddd xmm5, [rel PD_DESCALE_P1]
paddd xmm3,[rel PD_DESCALE_P1] paddd xmm3, [rel PD_DESCALE_P1]
psrad xmm5,DESCALE_P1 psrad xmm5, DESCALE_P1
psrad xmm3,DESCALE_P1 psrad xmm3, DESCALE_P1
packssdw xmm4,xmm1 ; xmm4=data5 packssdw xmm4, xmm1 ; xmm4=data5
packssdw xmm5,xmm3 ; xmm5=data3 packssdw xmm5, xmm3 ; xmm5=data3
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
@@ -368,12 +368,12 @@ EXTN(jsimd_fdct_islow_sse2):
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
movdqa xmm1,xmm6 ; transpose coefficients(phase 1) movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
movdqa xmm3,xmm0 ; transpose coefficients(phase 1) movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
@@ -384,82 +384,82 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 1) movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
movdqa xmm3,xmm5 ; transpose coefficients(phase 1) movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
movdqa xmm4,xmm2 ; transpose coefficients(phase 2) movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
movdqa xmm4,xmm6 ; transpose coefficients(phase 2) movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2) movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
movdqa xmm5,xmm6 ; transpose coefficients(phase 3) movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
movdqa xmm3,xmm0 ; transpose coefficients(phase 3) movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
movdqa xmm2,xmm5 movdqa xmm2, xmm5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movdqa xmm5,xmm4 ; transpose coefficients(phase 3) movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
movdqa xmm6,xmm1 ; transpose coefficients(phase 3) movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
movdqa xmm0,xmm5 movdqa xmm0, xmm5
movdqa xmm3,xmm4 movdqa xmm3, xmm4
paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm1,xmm7 movdqa xmm1, xmm7
movdqa xmm6,xmm2 movdqa xmm6, xmm2
paddw xmm7,xmm5 ; xmm7=tmp10 paddw xmm7, xmm5 ; xmm7=tmp10
paddw xmm2,xmm4 ; xmm2=tmp11 paddw xmm2, xmm4 ; xmm2=tmp11
psubw xmm1,xmm5 ; xmm1=tmp13 psubw xmm1, xmm5 ; xmm1=tmp13
psubw xmm6,xmm4 ; xmm6=tmp12 psubw xmm6, xmm4 ; xmm6=tmp12
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
paddw xmm7,[rel PW_DESCALE_P2X] paddw xmm7, [rel PW_DESCALE_P2X]
paddw xmm5,[rel PW_DESCALE_P2X] paddw xmm5, [rel PW_DESCALE_P2X]
psraw xmm7,PASS1_BITS ; xmm7=data0 psraw xmm7, PASS1_BITS ; xmm7=data0
psraw xmm5,PASS1_BITS ; xmm5=data4 psraw xmm5, PASS1_BITS ; xmm5=data4
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
@@ -473,28 +473,28 @@ EXTN(jsimd_fdct_islow_sse2):
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movdqa xmm4,xmm1 ; xmm1=tmp13 movdqa xmm4, xmm1 ; xmm1=tmp13
movdqa xmm2,xmm1 movdqa xmm2, xmm1
punpcklwd xmm4,xmm6 ; xmm6=tmp12 punpcklwd xmm4, xmm6 ; xmm6=tmp12
punpckhwd xmm2,xmm6 punpckhwd xmm2, xmm6
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm6,xmm2 movdqa xmm6, xmm2
pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
paddd xmm4,[rel PD_DESCALE_P2] paddd xmm4, [rel PD_DESCALE_P2]
paddd xmm2,[rel PD_DESCALE_P2] paddd xmm2, [rel PD_DESCALE_P2]
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm2,DESCALE_P2 psrad xmm2, DESCALE_P2
paddd xmm1,[rel PD_DESCALE_P2] paddd xmm1, [rel PD_DESCALE_P2]
paddd xmm6,[rel PD_DESCALE_P2] paddd xmm6, [rel PD_DESCALE_P2]
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
psrad xmm6,DESCALE_P2 psrad xmm6, DESCALE_P2
packssdw xmm4,xmm2 ; xmm4=data2 packssdw xmm4, xmm2 ; xmm4=data2
packssdw xmm1,xmm6 ; xmm1=data6 packssdw xmm1, xmm6 ; xmm1=data6
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
@@ -504,10 +504,10 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
movdqa xmm2,xmm0 ; xmm0=tmp4 movdqa xmm2, xmm0 ; xmm0=tmp4
movdqa xmm6,xmm3 ; xmm3=tmp5 movdqa xmm6, xmm3 ; xmm3=tmp5
paddw xmm2,xmm7 ; xmm2=z3 paddw xmm2, xmm7 ; xmm2=z3
paddw xmm6,xmm5 ; xmm6=z4 paddw xmm6, xmm5 ; xmm6=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -518,16 +518,16 @@ EXTN(jsimd_fdct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm1,xmm2 movdqa xmm1, xmm2
punpcklwd xmm4,xmm6 punpcklwd xmm4, xmm6
punpckhwd xmm1,xmm6 punpckhwd xmm1, xmm6
movdqa xmm2,xmm4 movdqa xmm2, xmm4
movdqa xmm6,xmm1 movdqa xmm6, xmm1
pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
@@ -548,70 +548,70 @@ EXTN(jsimd_fdct_islow_sse2):
; data7 = tmp4 + z3; data5 = tmp5 + z4; ; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4; ; data3 = tmp6 + z3; data1 = tmp7 + z4;
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm1,xmm0 movdqa xmm1, xmm0
punpcklwd xmm4,xmm5 punpcklwd xmm4, xmm5
punpckhwd xmm1,xmm5 punpckhwd xmm1, xmm5
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm5,xmm1 movdqa xmm5, xmm1
pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
paddd xmm0,xmm2 ; xmm0=data1L paddd xmm0, xmm2 ; xmm0=data1L
paddd xmm5,xmm6 ; xmm5=data1H paddd xmm5, xmm6 ; xmm5=data1H
paddd xmm4,[rel PD_DESCALE_P2] paddd xmm4, [rel PD_DESCALE_P2]
paddd xmm1,[rel PD_DESCALE_P2] paddd xmm1, [rel PD_DESCALE_P2]
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
paddd xmm0,[rel PD_DESCALE_P2] paddd xmm0, [rel PD_DESCALE_P2]
paddd xmm5,[rel PD_DESCALE_P2] paddd xmm5, [rel PD_DESCALE_P2]
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
psrad xmm5,DESCALE_P2 psrad xmm5, DESCALE_P2
packssdw xmm4,xmm1 ; xmm4=data7 packssdw xmm4, xmm1 ; xmm4=data7
packssdw xmm0,xmm5 ; xmm0=data1 packssdw xmm0, xmm5 ; xmm0=data1
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
movdqa xmm1,xmm3 movdqa xmm1, xmm3
movdqa xmm5,xmm3 movdqa xmm5, xmm3
punpcklwd xmm1,xmm7 punpcklwd xmm1, xmm7
punpckhwd xmm5,xmm7 punpckhwd xmm5, xmm7
movdqa xmm3,xmm1 movdqa xmm3, xmm1
movdqa xmm7,xmm5 movdqa xmm7, xmm5
pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
paddd xmm1,xmm2 ; xmm1=data5L paddd xmm1, xmm2 ; xmm1=data5L
paddd xmm5,xmm6 ; xmm5=data5H paddd xmm5, xmm6 ; xmm5=data5H
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
paddd xmm1,[rel PD_DESCALE_P2] paddd xmm1, [rel PD_DESCALE_P2]
paddd xmm5,[rel PD_DESCALE_P2] paddd xmm5, [rel PD_DESCALE_P2]
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
psrad xmm5,DESCALE_P2 psrad xmm5, DESCALE_P2
paddd xmm3,[rel PD_DESCALE_P2] paddd xmm3, [rel PD_DESCALE_P2]
paddd xmm7,[rel PD_DESCALE_P2] paddd xmm7, [rel PD_DESCALE_P2]
psrad xmm3,DESCALE_P2 psrad xmm3, DESCALE_P2
psrad xmm7,DESCALE_P2 psrad xmm7, DESCALE_P2
packssdw xmm1,xmm5 ; xmm1=data5 packssdw xmm1, xmm5 ; xmm1=data5
packssdw xmm3,xmm7 ; xmm3=data3 packssdw xmm3, xmm7 ; xmm3=data3
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -47,18 +47,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -104,11 +104,11 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
EXTN(jsimd_fdct_islow_sse2): EXTN(jsimd_fdct_islow_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
@@ -130,12 +130,12 @@ EXTN(jsimd_fdct_islow_sse2):
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm0 ; transpose coefficients(phase 1) movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5,xmm2 ; transpose coefficients(phase 1) movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
@@ -148,80 +148,80 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5,xmm1 ; transpose coefficients(phase 1) movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7,xmm6 ; transpose coefficients(phase 2) movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3,xmm2 ; transpose coefficients(phase 2) movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2,xmm4 ; transpose coefficients(phase 2) movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1,xmm0 ; transpose coefficients(phase 3) movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5,xmm2 ; transpose coefficients(phase 3) movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6,xmm1 movdqa xmm6, xmm1
movdqa xmm3,xmm0 movdqa xmm3, xmm0
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0,xmm4 ; transpose coefficients(phase 3) movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm0,xmm6 movdqa xmm0, xmm6
paddw xmm3,xmm1 ; xmm3=tmp10 paddw xmm3, xmm1 ; xmm3=tmp10
paddw xmm6,xmm7 ; xmm6=tmp11 paddw xmm6, xmm7 ; xmm6=tmp11
psubw xmm4,xmm1 ; xmm4=tmp13 psubw xmm4, xmm1 ; xmm4=tmp13
psubw xmm0,xmm7 ; xmm0=tmp12 psubw xmm0, xmm7 ; xmm0=tmp12
movdqa xmm1,xmm3 movdqa xmm1, xmm3
paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
psllw xmm3,PASS1_BITS ; xmm3=data0 psllw xmm3, PASS1_BITS ; xmm3=data0
psllw xmm1,PASS1_BITS ; xmm1=data4 psllw xmm1, PASS1_BITS ; xmm1=data4
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
@@ -235,28 +235,28 @@ EXTN(jsimd_fdct_islow_sse2):
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movdqa xmm7,xmm4 ; xmm4=tmp13 movdqa xmm7, xmm4 ; xmm4=tmp13
movdqa xmm6,xmm4 movdqa xmm6, xmm4
punpcklwd xmm7,xmm0 ; xmm0=tmp12 punpcklwd xmm7, xmm0 ; xmm0=tmp12
punpckhwd xmm6,xmm0 punpckhwd xmm6, xmm0
movdqa xmm4,xmm7 movdqa xmm4, xmm7
movdqa xmm0,xmm6 movdqa xmm0, xmm6
pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
psrad xmm6,DESCALE_P1 psrad xmm6, DESCALE_P1
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
packssdw xmm7,xmm6 ; xmm7=data2 packssdw xmm7, xmm6 ; xmm7=data2
packssdw xmm4,xmm0 ; xmm4=data6 packssdw xmm4, xmm0 ; xmm4=data6
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
@@ -266,10 +266,10 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
movdqa xmm6,xmm2 ; xmm2=tmp4 movdqa xmm6, xmm2 ; xmm2=tmp4
movdqa xmm0,xmm5 ; xmm5=tmp5 movdqa xmm0, xmm5 ; xmm5=tmp5
paddw xmm6,xmm3 ; xmm6=z3 paddw xmm6, xmm3 ; xmm6=z3
paddw xmm0,xmm1 ; xmm0=z4 paddw xmm0, xmm1 ; xmm0=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -280,16 +280,16 @@ EXTN(jsimd_fdct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm7,xmm6 movdqa xmm7, xmm6
movdqa xmm4,xmm6 movdqa xmm4, xmm6
punpcklwd xmm7,xmm0 punpcklwd xmm7, xmm0
punpckhwd xmm4,xmm0 punpckhwd xmm4, xmm0
movdqa xmm6,xmm7 movdqa xmm6, xmm7
movdqa xmm0,xmm4 movdqa xmm0, xmm4
pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
@@ -310,61 +310,61 @@ EXTN(jsimd_fdct_islow_sse2):
; data7 = tmp4 + z3; data5 = tmp5 + z4; ; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4; ; data3 = tmp6 + z3; data1 = tmp7 + z4;
movdqa xmm7,xmm2 movdqa xmm7, xmm2
movdqa xmm4,xmm2 movdqa xmm4, xmm2
punpcklwd xmm7,xmm1 punpcklwd xmm7, xmm1
punpckhwd xmm4,xmm1 punpckhwd xmm4, xmm1
movdqa xmm2,xmm7 movdqa xmm2, xmm7
movdqa xmm1,xmm4 movdqa xmm1, xmm4
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
paddd xmm2,xmm6 ; xmm2=data1L paddd xmm2, xmm6 ; xmm2=data1L
paddd xmm1,xmm0 ; xmm1=data1H paddd xmm1, xmm0 ; xmm1=data1H
paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm2,DESCALE_P1 psrad xmm2, DESCALE_P1
psrad xmm1,DESCALE_P1 psrad xmm1, DESCALE_P1
packssdw xmm7,xmm4 ; xmm7=data7 packssdw xmm7, xmm4 ; xmm7=data7
packssdw xmm2,xmm1 ; xmm2=data1 packssdw xmm2, xmm1 ; xmm2=data1
movdqa xmm4,xmm5 movdqa xmm4, xmm5
movdqa xmm1,xmm5 movdqa xmm1, xmm5
punpcklwd xmm4,xmm3 punpcklwd xmm4, xmm3
punpckhwd xmm1,xmm3 punpckhwd xmm1, xmm3
movdqa xmm5,xmm4 movdqa xmm5, xmm4
movdqa xmm3,xmm1 movdqa xmm3, xmm1
pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
paddd xmm4,xmm6 ; xmm4=data5L paddd xmm4, xmm6 ; xmm4=data5L
paddd xmm1,xmm0 ; xmm1=data5H paddd xmm1, xmm0 ; xmm1=data5H
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
psrad xmm1,DESCALE_P1 psrad xmm1, DESCALE_P1
paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm5,DESCALE_P1 psrad xmm5, DESCALE_P1
psrad xmm3,DESCALE_P1 psrad xmm3, DESCALE_P1
packssdw xmm4,xmm1 ; xmm4=data5 packssdw xmm4, xmm1 ; xmm4=data5
packssdw xmm5,xmm3 ; xmm5=data3 packssdw xmm5, xmm3 ; xmm5=data3
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
@@ -376,12 +376,12 @@ EXTN(jsimd_fdct_islow_sse2):
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
movdqa xmm1,xmm6 ; transpose coefficients(phase 1) movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
movdqa xmm3,xmm0 ; transpose coefficients(phase 1) movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
@@ -392,82 +392,82 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 1) movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
movdqa xmm3,xmm5 ; transpose coefficients(phase 1) movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
movdqa xmm4,xmm2 ; transpose coefficients(phase 2) movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
movdqa xmm7,xmm0 ; transpose coefficients(phase 2) movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
movdqa xmm4,xmm6 ; transpose coefficients(phase 2) movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2) movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
movdqa xmm5,xmm6 ; transpose coefficients(phase 3) movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
movdqa xmm3,xmm0 ; transpose coefficients(phase 3) movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
movdqa xmm2,xmm5 movdqa xmm2, xmm5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movdqa xmm5,xmm4 ; transpose coefficients(phase 3) movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
movdqa xmm6,xmm1 ; transpose coefficients(phase 3) movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
movdqa xmm0,xmm5 movdqa xmm0, xmm5
movdqa xmm3,xmm4 movdqa xmm3, xmm4
paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part ; -- Even part
movdqa xmm1,xmm7 movdqa xmm1, xmm7
movdqa xmm6,xmm2 movdqa xmm6, xmm2
paddw xmm7,xmm5 ; xmm7=tmp10 paddw xmm7, xmm5 ; xmm7=tmp10
paddw xmm2,xmm4 ; xmm2=tmp11 paddw xmm2, xmm4 ; xmm2=tmp11
psubw xmm1,xmm5 ; xmm1=tmp13 psubw xmm1, xmm5 ; xmm1=tmp13
psubw xmm6,xmm4 ; xmm6=tmp12 psubw xmm6, xmm4 ; xmm6=tmp12
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
psraw xmm7,PASS1_BITS ; xmm7=data0 psraw xmm7, PASS1_BITS ; xmm7=data0
psraw xmm5,PASS1_BITS ; xmm5=data4 psraw xmm5, PASS1_BITS ; xmm5=data4
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
@@ -481,28 +481,28 @@ EXTN(jsimd_fdct_islow_sse2):
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movdqa xmm4,xmm1 ; xmm1=tmp13 movdqa xmm4, xmm1 ; xmm1=tmp13
movdqa xmm2,xmm1 movdqa xmm2, xmm1
punpcklwd xmm4,xmm6 ; xmm6=tmp12 punpcklwd xmm4, xmm6 ; xmm6=tmp12
punpckhwd xmm2,xmm6 punpckhwd xmm2, xmm6
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm6,xmm2 movdqa xmm6, xmm2
pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm2,DESCALE_P2 psrad xmm2, DESCALE_P2
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
psrad xmm6,DESCALE_P2 psrad xmm6, DESCALE_P2
packssdw xmm4,xmm2 ; xmm4=data2 packssdw xmm4, xmm2 ; xmm4=data2
packssdw xmm1,xmm6 ; xmm1=data6 packssdw xmm1, xmm6 ; xmm1=data6
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
@@ -512,10 +512,10 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
movdqa xmm2,xmm0 ; xmm0=tmp4 movdqa xmm2, xmm0 ; xmm0=tmp4
movdqa xmm6,xmm3 ; xmm3=tmp5 movdqa xmm6, xmm3 ; xmm3=tmp5
paddw xmm2,xmm7 ; xmm2=z3 paddw xmm2, xmm7 ; xmm2=z3
paddw xmm6,xmm5 ; xmm6=z4 paddw xmm6, xmm5 ; xmm6=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -526,16 +526,16 @@ EXTN(jsimd_fdct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm1,xmm2 movdqa xmm1, xmm2
punpcklwd xmm4,xmm6 punpcklwd xmm4, xmm6
punpckhwd xmm1,xmm6 punpckhwd xmm1, xmm6
movdqa xmm2,xmm4 movdqa xmm2, xmm4
movdqa xmm6,xmm1 movdqa xmm6, xmm1
pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
@@ -556,64 +556,64 @@ EXTN(jsimd_fdct_islow_sse2):
; data7 = tmp4 + z3; data5 = tmp5 + z4; ; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4; ; data3 = tmp6 + z3; data1 = tmp7 + z4;
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm1,xmm0 movdqa xmm1, xmm0
punpcklwd xmm4,xmm5 punpcklwd xmm4, xmm5
punpckhwd xmm1,xmm5 punpckhwd xmm1, xmm5
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm5,xmm1 movdqa xmm5, xmm1
pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
paddd xmm0,xmm2 ; xmm0=data1L paddd xmm0, xmm2 ; xmm0=data1L
paddd xmm5,xmm6 ; xmm5=data1H paddd xmm5, xmm6 ; xmm5=data1H
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
psrad xmm5,DESCALE_P2 psrad xmm5, DESCALE_P2
packssdw xmm4,xmm1 ; xmm4=data7 packssdw xmm4, xmm1 ; xmm4=data7
packssdw xmm0,xmm5 ; xmm0=data1 packssdw xmm0, xmm5 ; xmm0=data1
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
movdqa xmm1,xmm3 movdqa xmm1, xmm3
movdqa xmm5,xmm3 movdqa xmm5, xmm3
punpcklwd xmm1,xmm7 punpcklwd xmm1, xmm7
punpckhwd xmm5,xmm7 punpckhwd xmm5, xmm7
movdqa xmm3,xmm1 movdqa xmm3, xmm1
movdqa xmm7,xmm5 movdqa xmm7, xmm5
pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
paddd xmm1,xmm2 ; xmm1=data5L paddd xmm1, xmm2 ; xmm1=data5L
paddd xmm5,xmm6 ; xmm5=data5H paddd xmm5, xmm6 ; xmm5=data5H
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
psrad xmm5,DESCALE_P2 psrad xmm5, DESCALE_P2
paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm3,DESCALE_P2 psrad xmm3, DESCALE_P2
psrad xmm7,DESCALE_P2 psrad xmm7, DESCALE_P2
packssdw xmm1,xmm5 ; xmm1=data5 packssdw xmm1, xmm5 ; xmm1=data5
packssdw xmm3,xmm7 ; xmm3=data3 packssdw xmm3, xmm7 ; xmm3=data3
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
@@ -623,7 +623,7 @@ EXTN(jsimd_fdct_islow_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -26,11 +26,11 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -77,11 +77,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_float_sse2): EXTN(jsimd_idct_float_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [workspace] lea rsp, [workspace]
collect_args collect_args
push rbx push rbx
@@ -105,35 +105,35 @@ EXTN(jsimd_idct_float_sse2):
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm2 por xmm1, xmm2
por xmm3,xmm4 por xmm3, xmm4
por xmm5,xmm6 por xmm5, xmm6
por xmm1,xmm3 por xmm1, xmm3
por xmm5,xmm7 por xmm5, xmm7
por xmm1,xmm5 por xmm1, xmm5
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm1,xmm0 movaps xmm1, xmm0
movaps xmm2,xmm0 movaps xmm2, xmm0
movaps xmm3,xmm0 movaps xmm3, xmm0
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
@@ -154,41 +154,41 @@ EXTN(jsimd_idct_float_sse2):
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[rel PD_1_414] mulps xmm1, [rel PD_1_414]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
@@ -200,63 +200,63 @@ EXTN(jsimd_idct_float_sse2):
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[rel PD_1_847] ; xmm0=z5 mulps xmm0, [rel PD_1_847] ; xmm0=z5
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,xmm6 ; transpose coefficients(phase 1) movaps xmm1, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
movaps xmm3,xmm0 ; transpose coefficients(phase 1) movaps xmm3, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
@@ -264,27 +264,27 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm3,xmm5 movaps xmm3, xmm5
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
movaps xmm2,xmm7 ; transpose coefficients(phase 1) movaps xmm2, xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
movaps xmm4,xmm5 ; transpose coefficients(phase 1) movaps xmm4, xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3,xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0,xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
@@ -294,12 +294,12 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6,xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3,xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
@@ -336,22 +336,22 @@ EXTN(jsimd_idct_float_sse2):
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[rel PD_1_414] mulps xmm1, [rel PD_1_414]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
@@ -363,98 +363,98 @@ EXTN(jsimd_idct_float_sse2):
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[rel PD_1_847] ; xmm0=z5 mulps xmm0, [rel PD_1_847] ; xmm0=z5
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm3,xmm3 pcmpeqd xmm3, xmm3
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm5,xmm3 movaps xmm5, xmm3
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
pcmpeqd xmm4,xmm4 pcmpeqd xmm4, xmm4
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6,xmm2 paddb xmm6, xmm2
paddb xmm1,xmm2 paddb xmm1, xmm2
movdqa xmm4,xmm6 ; transpose coefficients(phase 2) movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm7,xmm6 ; transpose coefficients(phase 3) movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
@@ -472,7 +472,7 @@ EXTN(jsimd_idct_float_sse2):
pop rbx pop rbx
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -25,11 +25,11 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1,%2,0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1,%2,0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -76,11 +76,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_float_sse2): EXTN(jsimd_idct_float_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [workspace] lea esp, [workspace]
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
@@ -97,7 +97,7 @@ EXTN(jsimd_idct_float_sse2):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16,7 alignx 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -111,35 +111,35 @@ EXTN(jsimd_idct_float_sse2):
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm2 por xmm1, xmm2
por xmm3,xmm4 por xmm3, xmm4
por xmm5,xmm6 por xmm5, xmm6
por xmm1,xmm3 por xmm1, xmm3
por xmm5,xmm7 por xmm5, xmm7
por xmm1,xmm5 por xmm1, xmm5
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm1,xmm0 movaps xmm1, xmm0
movaps xmm2,xmm0 movaps xmm2, xmm0
movaps xmm3,xmm0 movaps xmm3, xmm0
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
@@ -150,7 +150,7 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -161,41 +161,41 @@ EXTN(jsimd_idct_float_sse2):
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[GOTOFF(ebx,PD_1_414)] mulps xmm1, [GOTOFF(ebx,PD_1_414)]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
@@ -207,63 +207,63 @@ EXTN(jsimd_idct_float_sse2):
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,xmm6 ; transpose coefficients(phase 1) movaps xmm1, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
movaps xmm3,xmm0 ; transpose coefficients(phase 1) movaps xmm3, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
@@ -271,27 +271,27 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm0,xmm7 movaps xmm0, xmm7
movaps xmm3,xmm5 movaps xmm3, xmm5
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
movaps xmm2,xmm7 ; transpose coefficients(phase 1) movaps xmm2, xmm7 ; transpose coefficients(phase 1)
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
movaps xmm4,xmm5 ; transpose coefficients(phase 1) movaps xmm4, xmm5 ; transpose coefficients(phase 1)
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3,xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0,xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
@@ -301,12 +301,12 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6,xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3,xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
@@ -334,7 +334,7 @@ EXTN(jsimd_idct_float_sse2):
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16,7 alignx 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
@@ -344,22 +344,22 @@ EXTN(jsimd_idct_float_sse2):
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm0 movaps xmm4, xmm0
movaps xmm5,xmm1 movaps xmm5, xmm1
subps xmm0,xmm2 ; xmm0=tmp11 subps xmm0, xmm2 ; xmm0=tmp11
subps xmm1,xmm3 subps xmm1, xmm3
addps xmm4,xmm2 ; xmm4=tmp10 addps xmm4, xmm2 ; xmm4=tmp10
addps xmm5,xmm3 ; xmm5=tmp13 addps xmm5, xmm3 ; xmm5=tmp13
mulps xmm1,[GOTOFF(ebx,PD_1_414)] mulps xmm1, [GOTOFF(ebx,PD_1_414)]
subps xmm1,xmm5 ; xmm1=tmp12 subps xmm1, xmm5 ; xmm1=tmp12
movaps xmm6,xmm4 movaps xmm6, xmm4
movaps xmm7,xmm0 movaps xmm7, xmm0
subps xmm4,xmm5 ; xmm4=tmp3 subps xmm4, xmm5 ; xmm4=tmp3
subps xmm0,xmm1 ; xmm0=tmp2 subps xmm0, xmm1 ; xmm0=tmp2
addps xmm6,xmm5 ; xmm6=tmp0 addps xmm6, xmm5 ; xmm6=tmp0
addps xmm7,xmm1 ; xmm7=tmp1 addps xmm7, xmm1 ; xmm7=tmp1
movaps XMMWORD [wk(1)], xmm4 ; tmp3 movaps XMMWORD [wk(1)], xmm4 ; tmp3
movaps XMMWORD [wk(0)], xmm0 ; tmp2 movaps XMMWORD [wk(0)], xmm0 ; tmp2
@@ -371,98 +371,98 @@ EXTN(jsimd_idct_float_sse2):
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm4,xmm2 movaps xmm4, xmm2
movaps xmm0,xmm5 movaps xmm0, xmm5
addps xmm2,xmm1 ; xmm2=z11 addps xmm2, xmm1 ; xmm2=z11
addps xmm5,xmm3 ; xmm5=z13 addps xmm5, xmm3 ; xmm5=z13
subps xmm4,xmm1 ; xmm4=z12 subps xmm4, xmm1 ; xmm4=z12
subps xmm0,xmm3 ; xmm0=z10 subps xmm0, xmm3 ; xmm0=z10
movaps xmm1,xmm2 movaps xmm1, xmm2
subps xmm2,xmm5 subps xmm2, xmm5
addps xmm1,xmm5 ; xmm1=tmp7 addps xmm1, xmm5 ; xmm1=tmp7
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
movaps xmm3,xmm0 movaps xmm3, xmm0
addps xmm0,xmm4 addps xmm0, xmm4
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
addps xmm3,xmm0 ; xmm3=tmp12 addps xmm3, xmm0 ; xmm3=tmp12
subps xmm4,xmm0 ; xmm4=tmp10 subps xmm4, xmm0 ; xmm4=tmp10
; -- Final output stage ; -- Final output stage
subps xmm3,xmm1 ; xmm3=tmp6 subps xmm3, xmm1 ; xmm3=tmp6
movaps xmm5,xmm6 movaps xmm5, xmm6
movaps xmm0,xmm7 movaps xmm0, xmm7
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
subps xmm2,xmm3 ; xmm2=tmp5 subps xmm2, xmm3 ; xmm2=tmp5
movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
pcmpeqd xmm3,xmm3 pcmpeqd xmm3, xmm3
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
addps xmm4,xmm2 ; xmm4=tmp4 addps xmm4, xmm2 ; xmm4=tmp4
movaps xmm7,xmm1 movaps xmm7, xmm1
movaps xmm5,xmm3 movaps xmm5, xmm3
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
pcmpeqd xmm4,xmm4 pcmpeqd xmm4, xmm4
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
paddb xmm6,xmm2 paddb xmm6, xmm2
paddb xmm1,xmm2 paddb xmm1, xmm2
movdqa xmm4,xmm6 ; transpose coefficients(phase 2) movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
movdqa xmm7,xmm6 ; transpose coefficients(phase 3) movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pushpic ebx ; save GOT address pushpic ebx ; save GOT address
@@ -487,7 +487,7 @@ EXTN(jsimd_idct_float_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
pop ebx pop ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -42,10 +42,10 @@ F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif %endif
@@ -96,11 +96,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_ifast_sse2): EXTN(jsimd_idct_ifast_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
@@ -121,11 +121,11 @@ EXTN(jsimd_idct_ifast_sse2):
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm0 por xmm1, xmm0
packsswb xmm1,xmm1 packsswb xmm1, xmm1
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -133,18 +133,18 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
@@ -163,23 +163,23 @@ EXTN(jsimd_idct_ifast_sse2):
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
psubw xmm0,xmm2 ; xmm0=tmp11 psubw xmm0, xmm2 ; xmm0=tmp11
psubw xmm1,xmm3 psubw xmm1, xmm3
paddw xmm4,xmm2 ; xmm4=tmp10 paddw xmm4, xmm2 ; xmm4=tmp10
paddw xmm5,xmm3 ; xmm5=tmp13 paddw xmm5, xmm3 ; xmm5=tmp13
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1,[rel PW_F1414] pmulhw xmm1, [rel PW_F1414]
psubw xmm1,xmm5 ; xmm1=tmp12 psubw xmm1, xmm5 ; xmm1=tmp12
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm7,xmm0 movdqa xmm7, xmm0
psubw xmm4,xmm5 ; xmm4=tmp3 psubw xmm4, xmm5 ; xmm4=tmp3
psubw xmm0,xmm1 ; xmm0=tmp2 psubw xmm0, xmm1 ; xmm0=tmp2
paddw xmm6,xmm5 ; xmm6=tmp0 paddw xmm6, xmm5 ; xmm6=tmp0
paddw xmm7,xmm1 ; xmm7=tmp1 paddw xmm7, xmm1 ; xmm7=tmp1
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
@@ -195,23 +195,23 @@ EXTN(jsimd_idct_ifast_sse2):
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm2,xmm1 ; xmm2=z12 psubw xmm2, xmm1 ; xmm2=z12
psubw xmm5,xmm3 ; xmm5=z10 psubw xmm5, xmm3 ; xmm5=z10
paddw xmm4,xmm1 ; xmm4=z11 paddw xmm4, xmm1 ; xmm4=z11
paddw xmm0,xmm3 ; xmm0=z13 paddw xmm0, xmm3 ; xmm0=z13
movdqa xmm1,xmm5 ; xmm1=z10(unscaled) movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
movdqa xmm3,xmm4 movdqa xmm3, xmm4
psubw xmm4,xmm0 psubw xmm4, xmm0
paddw xmm3,xmm0 ; xmm3=tmp7 paddw xmm3, xmm0 ; xmm3=tmp7
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
@@ -222,32 +222,32 @@ EXTN(jsimd_idct_ifast_sse2):
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm0,xmm5 movdqa xmm0, xmm5
paddw xmm5,xmm2 paddw xmm5, xmm2
pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
pmulhw xmm0,[rel PW_MF1613] pmulhw xmm0, [rel PW_MF1613]
pmulhw xmm2,[rel PW_F1082] pmulhw xmm2, [rel PW_F1082]
psubw xmm0,xmm1 psubw xmm0, xmm1
psubw xmm2,xmm5 ; xmm2=tmp10 psubw xmm2, xmm5 ; xmm2=tmp10
paddw xmm0,xmm5 ; xmm0=tmp12 paddw xmm0, xmm5 ; xmm0=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm0,xmm3 ; xmm0=tmp6 psubw xmm0, xmm3 ; xmm0=tmp6
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4,xmm0 ; xmm4=tmp5 psubw xmm4, xmm0 ; xmm4=tmp5
movdqa xmm3,xmm6 ; transpose coefficients(phase 1) movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0,xmm5 ; transpose coefficients(phase 1) movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
@@ -255,27 +255,27 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
paddw xmm2,xmm4 ; xmm2=tmp4 paddw xmm2, xmm4 ; xmm2=tmp4
movdqa xmm5,xmm7 movdqa xmm5, xmm7
movdqa xmm0,xmm1 movdqa xmm0, xmm1
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm7 ; transpose coefficients(phase 1) movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm1 ; transpose coefficients(phase 1) movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm0,xmm3 ; transpose coefficients(phase 2) movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2) movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
@@ -283,19 +283,19 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa xmm3,xmm1 ; transpose coefficients(phase 2) movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 2) movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm4,xmm6 ; transpose coefficients(phase 3) movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7,xmm5 ; transpose coefficients(phase 3) movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
@@ -303,12 +303,12 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa xmm4,xmm1 ; transpose coefficients(phase 3) movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7,xmm3 ; transpose coefficients(phase 3) movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
@@ -328,23 +328,23 @@ EXTN(jsimd_idct_ifast_sse2):
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm6,xmm1 ; xmm6=tmp11 psubw xmm6, xmm1 ; xmm6=tmp11
psubw xmm5,xmm3 psubw xmm5, xmm3
paddw xmm2,xmm1 ; xmm2=tmp10 paddw xmm2, xmm1 ; xmm2=tmp10
paddw xmm0,xmm3 ; xmm0=tmp13 paddw xmm0, xmm3 ; xmm0=tmp13
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[rel PW_F1414] pmulhw xmm5, [rel PW_F1414]
psubw xmm5,xmm0 ; xmm5=tmp12 psubw xmm5, xmm0 ; xmm5=tmp12
movdqa xmm1,xmm2 movdqa xmm1, xmm2
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm2,xmm0 ; xmm2=tmp3 psubw xmm2, xmm0 ; xmm2=tmp3
psubw xmm6,xmm5 ; xmm6=tmp2 psubw xmm6, xmm5 ; xmm6=tmp2
paddw xmm1,xmm0 ; xmm1=tmp0 paddw xmm1, xmm0 ; xmm1=tmp0
paddw xmm3,xmm5 ; xmm3=tmp1 paddw xmm3, xmm5 ; xmm3=tmp1
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
@@ -356,23 +356,23 @@ EXTN(jsimd_idct_ifast_sse2):
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm6,xmm4 movdqa xmm6, xmm4
psubw xmm0,xmm7 ; xmm0=z12 psubw xmm0, xmm7 ; xmm0=z12
psubw xmm4,xmm5 ; xmm4=z10 psubw xmm4, xmm5 ; xmm4=z10
paddw xmm2,xmm7 ; xmm2=z11 paddw xmm2, xmm7 ; xmm2=z11
paddw xmm6,xmm5 ; xmm6=z13 paddw xmm6, xmm5 ; xmm6=z13
movdqa xmm7,xmm4 ; xmm7=z10(unscaled) movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
movdqa xmm5,xmm2 movdqa xmm5, xmm2
psubw xmm2,xmm6 psubw xmm2, xmm6
paddw xmm5,xmm6 ; xmm5=tmp7 paddw xmm5, xmm6 ; xmm5=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
@@ -383,83 +383,83 @@ EXTN(jsimd_idct_ifast_sse2):
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm6,xmm4 movdqa xmm6, xmm4
paddw xmm4,xmm0 paddw xmm4, xmm0
pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
pmulhw xmm6,[rel PW_MF1613] pmulhw xmm6, [rel PW_MF1613]
pmulhw xmm0,[rel PW_F1082] pmulhw xmm0, [rel PW_F1082]
psubw xmm6,xmm7 psubw xmm6, xmm7
psubw xmm0,xmm4 ; xmm0=tmp10 psubw xmm0, xmm4 ; xmm0=tmp10
paddw xmm6,xmm4 ; xmm6=tmp12 paddw xmm6, xmm4 ; xmm6=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm6,xmm5 ; xmm6=tmp6 psubw xmm6, xmm5 ; xmm6=tmp6
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm3 movdqa xmm4, xmm3
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1,(PASS1_BITS+3) ; descale psraw xmm1, (PASS1_BITS+3) ; descale
psraw xmm3,(PASS1_BITS+3) ; descale psraw xmm3, (PASS1_BITS+3) ; descale
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psubw xmm2,xmm6 ; xmm2=tmp5 psubw xmm2, xmm6 ; xmm2=tmp5
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
paddw xmm0,xmm2 ; xmm0=tmp4 paddw xmm0, xmm2 ; xmm0=tmp4
movdqa xmm4,xmm5 movdqa xmm4, xmm5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5,(PASS1_BITS+3) ; descale psraw xmm5, (PASS1_BITS+3) ; descale
psraw xmm6,(PASS1_BITS+3) ; descale psraw xmm6, (PASS1_BITS+3) ; descale
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm1,xmm2 paddb xmm1, xmm2
paddb xmm3,xmm2 paddb xmm3, xmm2
paddb xmm5,xmm2 paddb xmm5, xmm2
paddb xmm7,xmm2 paddb xmm7, xmm2
movdqa xmm0,xmm1 ; transpose coefficients(phase 1) movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1) movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm1 ; transpose coefficients(phase 2) movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2,xmm6 ; transpose coefficients(phase 2) movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm3,xmm1 ; transpose coefficients(phase 3) movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7,xmm4 ; transpose coefficients(phase 3) movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
@@ -480,7 +480,7 @@ EXTN(jsimd_idct_ifast_sse2):
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -41,10 +41,10 @@ F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif %endif
@@ -95,11 +95,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_ifast_sse2): EXTN(jsimd_idct_ifast_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
@@ -127,11 +127,11 @@ EXTN(jsimd_idct_ifast_sse2):
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm0 por xmm1, xmm0
packsswb xmm1,xmm1 packsswb xmm1, xmm1
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -139,23 +139,23 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end jmp near .column_end
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -170,23 +170,23 @@ EXTN(jsimd_idct_ifast_sse2):
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm1 movdqa xmm5, xmm1
psubw xmm0,xmm2 ; xmm0=tmp11 psubw xmm0, xmm2 ; xmm0=tmp11
psubw xmm1,xmm3 psubw xmm1, xmm3
paddw xmm4,xmm2 ; xmm4=tmp10 paddw xmm4, xmm2 ; xmm4=tmp10
paddw xmm5,xmm3 ; xmm5=tmp13 paddw xmm5, xmm3 ; xmm5=tmp13
psllw xmm1,PRE_MULTIPLY_SCALE_BITS psllw xmm1, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
psubw xmm1,xmm5 ; xmm1=tmp12 psubw xmm1, xmm5 ; xmm1=tmp12
movdqa xmm6,xmm4 movdqa xmm6, xmm4
movdqa xmm7,xmm0 movdqa xmm7, xmm0
psubw xmm4,xmm5 ; xmm4=tmp3 psubw xmm4, xmm5 ; xmm4=tmp3
psubw xmm0,xmm1 ; xmm0=tmp2 psubw xmm0, xmm1 ; xmm0=tmp2
paddw xmm6,xmm5 ; xmm6=tmp0 paddw xmm6, xmm5 ; xmm6=tmp0
paddw xmm7,xmm1 ; xmm7=tmp1 paddw xmm7, xmm1 ; xmm7=tmp1
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
@@ -202,23 +202,23 @@ EXTN(jsimd_idct_ifast_sse2):
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
movdqa xmm4,xmm2 movdqa xmm4, xmm2
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm2,xmm1 ; xmm2=z12 psubw xmm2, xmm1 ; xmm2=z12
psubw xmm5,xmm3 ; xmm5=z10 psubw xmm5, xmm3 ; xmm5=z10
paddw xmm4,xmm1 ; xmm4=z11 paddw xmm4, xmm1 ; xmm4=z11
paddw xmm0,xmm3 ; xmm0=z13 paddw xmm0, xmm3 ; xmm0=z13
movdqa xmm1,xmm5 ; xmm1=z10(unscaled) movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
movdqa xmm3,xmm4 movdqa xmm3, xmm4
psubw xmm4,xmm0 psubw xmm4, xmm0
paddw xmm3,xmm0 ; xmm3=tmp7 paddw xmm3, xmm0 ; xmm3=tmp7
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
@@ -229,32 +229,32 @@ EXTN(jsimd_idct_ifast_sse2):
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm0,xmm5 movdqa xmm0, xmm5
paddw xmm5,xmm2 paddw xmm5, xmm2
pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
psubw xmm0,xmm1 psubw xmm0, xmm1
psubw xmm2,xmm5 ; xmm2=tmp10 psubw xmm2, xmm5 ; xmm2=tmp10
paddw xmm0,xmm5 ; xmm0=tmp12 paddw xmm0, xmm5 ; xmm0=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm0,xmm3 ; xmm0=tmp6 psubw xmm0, xmm3 ; xmm0=tmp6
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm5,xmm7 movdqa xmm5, xmm7
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
psubw xmm4,xmm0 ; xmm4=tmp5 psubw xmm4, xmm0 ; xmm4=tmp5
movdqa xmm3,xmm6 ; transpose coefficients(phase 1) movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
movdqa xmm0,xmm5 ; transpose coefficients(phase 1) movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
@@ -262,27 +262,27 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
paddw xmm2,xmm4 ; xmm2=tmp4 paddw xmm2, xmm4 ; xmm2=tmp4
movdqa xmm5,xmm7 movdqa xmm5, xmm7
movdqa xmm0,xmm1 movdqa xmm0, xmm1
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
movdqa xmm4,xmm7 ; transpose coefficients(phase 1) movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm2,xmm1 ; transpose coefficients(phase 1) movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm0,xmm3 ; transpose coefficients(phase 2) movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2) movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
@@ -290,19 +290,19 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
movdqa xmm3,xmm1 ; transpose coefficients(phase 2) movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
movdqa xmm0,xmm2 ; transpose coefficients(phase 2) movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
movdqa xmm4,xmm6 ; transpose coefficients(phase 3) movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
movdqa xmm7,xmm5 ; transpose coefficients(phase 3) movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
@@ -310,12 +310,12 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
movdqa xmm4,xmm1 ; transpose coefficients(phase 3) movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
movdqa xmm7,xmm3 ; transpose coefficients(phase 3) movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
@@ -335,23 +335,23 @@ EXTN(jsimd_idct_ifast_sse2):
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm0,xmm5 movdqa xmm0, xmm5
psubw xmm6,xmm1 ; xmm6=tmp11 psubw xmm6, xmm1 ; xmm6=tmp11
psubw xmm5,xmm3 psubw xmm5, xmm3
paddw xmm2,xmm1 ; xmm2=tmp10 paddw xmm2, xmm1 ; xmm2=tmp10
paddw xmm0,xmm3 ; xmm0=tmp13 paddw xmm0, xmm3 ; xmm0=tmp13
psllw xmm5,PRE_MULTIPLY_SCALE_BITS psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
psubw xmm5,xmm0 ; xmm5=tmp12 psubw xmm5, xmm0 ; xmm5=tmp12
movdqa xmm1,xmm2 movdqa xmm1, xmm2
movdqa xmm3,xmm6 movdqa xmm3, xmm6
psubw xmm2,xmm0 ; xmm2=tmp3 psubw xmm2, xmm0 ; xmm2=tmp3
psubw xmm6,xmm5 ; xmm6=tmp2 psubw xmm6, xmm5 ; xmm6=tmp2
paddw xmm1,xmm0 ; xmm1=tmp0 paddw xmm1, xmm0 ; xmm1=tmp0
paddw xmm3,xmm5 ; xmm3=tmp1 paddw xmm3, xmm5 ; xmm3=tmp1
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
@@ -363,23 +363,23 @@ EXTN(jsimd_idct_ifast_sse2):
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm6,xmm4 movdqa xmm6, xmm4
psubw xmm0,xmm7 ; xmm0=z12 psubw xmm0, xmm7 ; xmm0=z12
psubw xmm4,xmm5 ; xmm4=z10 psubw xmm4, xmm5 ; xmm4=z10
paddw xmm2,xmm7 ; xmm2=z11 paddw xmm2, xmm7 ; xmm2=z11
paddw xmm6,xmm5 ; xmm6=z13 paddw xmm6, xmm5 ; xmm6=z13
movdqa xmm7,xmm4 ; xmm7=z10(unscaled) movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
psllw xmm0,PRE_MULTIPLY_SCALE_BITS psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm4,PRE_MULTIPLY_SCALE_BITS psllw xmm4, PRE_MULTIPLY_SCALE_BITS
movdqa xmm5,xmm2 movdqa xmm5, xmm2
psubw xmm2,xmm6 psubw xmm2, xmm6
paddw xmm5,xmm6 ; xmm5=tmp7 paddw xmm5, xmm6 ; xmm5=tmp7
psllw xmm2,PRE_MULTIPLY_SCALE_BITS psllw xmm2, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
; To avoid overflow... ; To avoid overflow...
; ;
@@ -390,83 +390,83 @@ EXTN(jsimd_idct_ifast_sse2):
; tmp12 = (-1.613125930 - 1) * z10 + z5; ; tmp12 = (-1.613125930 - 1) * z10 + z5;
; = -1.613125930 * z10 - z10 + z5; ; = -1.613125930 * z10 - z10 + z5;
movdqa xmm6,xmm4 movdqa xmm6, xmm4
paddw xmm4,xmm0 paddw xmm4, xmm0
pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
psubw xmm6,xmm7 psubw xmm6, xmm7
psubw xmm0,xmm4 ; xmm0=tmp10 psubw xmm0, xmm4 ; xmm0=tmp10
paddw xmm6,xmm4 ; xmm6=tmp12 paddw xmm6, xmm4 ; xmm6=tmp12
; -- Final output stage ; -- Final output stage
psubw xmm6,xmm5 ; xmm6=tmp6 psubw xmm6, xmm5 ; xmm6=tmp6
movdqa xmm7,xmm1 movdqa xmm7, xmm1
movdqa xmm4,xmm3 movdqa xmm4, xmm3
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
psraw xmm1,(PASS1_BITS+3) ; descale psraw xmm1, (PASS1_BITS+3) ; descale
psraw xmm3,(PASS1_BITS+3) ; descale psraw xmm3, (PASS1_BITS+3) ; descale
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psubw xmm2,xmm6 ; xmm2=tmp5 psubw xmm2, xmm6 ; xmm2=tmp5
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
paddw xmm0,xmm2 ; xmm0=tmp4 paddw xmm0, xmm2 ; xmm0=tmp4
movdqa xmm4,xmm5 movdqa xmm4, xmm5
movdqa xmm7,xmm6 movdqa xmm7, xmm6
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
psraw xmm5,(PASS1_BITS+3) ; descale psraw xmm5, (PASS1_BITS+3) ; descale
psraw xmm6,(PASS1_BITS+3) ; descale psraw xmm6, (PASS1_BITS+3) ; descale
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
psraw xmm4,(PASS1_BITS+3) ; descale psraw xmm4, (PASS1_BITS+3) ; descale
psraw xmm7,(PASS1_BITS+3) ; descale psraw xmm7, (PASS1_BITS+3) ; descale
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm1,xmm2 paddb xmm1, xmm2
paddb xmm3,xmm2 paddb xmm3, xmm2
paddb xmm5,xmm2 paddb xmm5, xmm2
paddb xmm7,xmm2 paddb xmm7, xmm2
movdqa xmm0,xmm1 ; transpose coefficients(phase 1) movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1) movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm1 ; transpose coefficients(phase 2) movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm2,xmm6 ; transpose coefficients(phase 2) movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm3,xmm1 ; transpose coefficients(phase 3) movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm7,xmm4 ; transpose coefficients(phase 3) movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
@@ -491,7 +491,7 @@ EXTN(jsimd_idct_ifast_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -48,18 +48,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -109,11 +109,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_islow_sse2): EXTN(jsimd_idct_islow_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
@@ -134,11 +134,11 @@ EXTN(jsimd_idct_islow_sse2):
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm1,xmm0 por xmm1, xmm0
packsswb xmm1,xmm1 packsswb xmm1, xmm1
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -146,20 +146,20 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm5,PASS1_BITS psllw xmm5, PASS1_BITS
movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
@@ -189,53 +189,53 @@ EXTN(jsimd_idct_islow_sse2):
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
movdqa xmm4,xmm1 ; xmm1=in2=z2 movdqa xmm4, xmm1 ; xmm1=in2=z2
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklwd xmm4,xmm3 ; xmm3=in6=z3 punpcklwd xmm4, xmm3 ; xmm3=in6=z3
punpckhwd xmm5,xmm3 punpckhwd xmm5, xmm3
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm3,xmm5 movdqa xmm3, xmm5
pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
movdqa xmm6,xmm0 movdqa xmm6, xmm0
paddw xmm0,xmm2 ; xmm0=in0+in4 paddw xmm0, xmm2 ; xmm0=in0+in4
psubw xmm6,xmm2 ; xmm6=in0-in4 psubw xmm6, xmm2 ; xmm6=in0-in4
pxor xmm7,xmm7 pxor xmm7, xmm7
pxor xmm2,xmm2 pxor xmm2, xmm2
punpcklwd xmm7,xmm0 ; xmm7=tmp0L punpcklwd xmm7, xmm0 ; xmm7=tmp0L
punpckhwd xmm2,xmm0 ; xmm2=tmp0H punpckhwd xmm2, xmm0 ; xmm2=tmp0H
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
movdqa xmm0,xmm7 movdqa xmm0, xmm7
paddd xmm7,xmm4 ; xmm7=tmp10L paddd xmm7, xmm4 ; xmm7=tmp10L
psubd xmm0,xmm4 ; xmm0=tmp13L psubd xmm0, xmm4 ; xmm0=tmp13L
movdqa xmm4,xmm2 movdqa xmm4, xmm2
paddd xmm2,xmm5 ; xmm2=tmp10H paddd xmm2, xmm5 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp13H psubd xmm4, xmm5 ; xmm4=tmp13H
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
pxor xmm5,xmm5 pxor xmm5, xmm5
pxor xmm7,xmm7 pxor xmm7, xmm7
punpcklwd xmm5,xmm6 ; xmm5=tmp1L punpcklwd xmm5, xmm6 ; xmm5=tmp1L
punpckhwd xmm7,xmm6 ; xmm7=tmp1H punpckhwd xmm7, xmm6 ; xmm7=tmp1H
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
movdqa xmm2,xmm5 movdqa xmm2, xmm5
paddd xmm5,xmm1 ; xmm5=tmp11L paddd xmm5, xmm1 ; xmm5=tmp11L
psubd xmm2,xmm1 ; xmm2=tmp12L psubd xmm2, xmm1 ; xmm2=tmp12L
movdqa xmm0,xmm7 movdqa xmm0, xmm7
paddd xmm7,xmm3 ; xmm7=tmp11H paddd xmm7, xmm3 ; xmm7=tmp11H
psubd xmm0,xmm3 ; xmm0=tmp12H psubd xmm0, xmm3 ; xmm0=tmp12H
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
@@ -253,10 +253,10 @@ EXTN(jsimd_idct_islow_sse2):
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddw xmm5,xmm3 ; xmm5=z3 paddw xmm5, xmm3 ; xmm5=z3
paddw xmm7,xmm1 ; xmm7=z4 paddw xmm7, xmm1 ; xmm7=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -267,16 +267,16 @@ EXTN(jsimd_idct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm2,xmm5 movdqa xmm2, xmm5
movdqa xmm0,xmm5 movdqa xmm0, xmm5
punpcklwd xmm2,xmm7 punpcklwd xmm2, xmm7
punpckhwd xmm0,xmm7 punpckhwd xmm0, xmm7
movdqa xmm5,xmm2 movdqa xmm5, xmm2
movdqa xmm7,xmm0 movdqa xmm7, xmm0
pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
@@ -297,38 +297,38 @@ EXTN(jsimd_idct_islow_sse2):
; tmp0 += z3; tmp1 += z4; ; tmp0 += z3; tmp1 += z4;
; tmp2 += z3; tmp3 += z4; ; tmp2 += z3; tmp3 += z4;
movdqa xmm2,xmm3 movdqa xmm2, xmm3
movdqa xmm0,xmm3 movdqa xmm0, xmm3
punpcklwd xmm2,xmm4 punpcklwd xmm2, xmm4
punpckhwd xmm0,xmm4 punpckhwd xmm0, xmm4
movdqa xmm3,xmm2 movdqa xmm3, xmm2
movdqa xmm4,xmm0 movdqa xmm4, xmm0
pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
paddd xmm3,xmm5 ; xmm3=tmp3L paddd xmm3, xmm5 ; xmm3=tmp3L
paddd xmm4,xmm7 ; xmm4=tmp3H paddd xmm4, xmm7 ; xmm4=tmp3H
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm0,xmm1 movdqa xmm0, xmm1
punpcklwd xmm2,xmm6 punpcklwd xmm2, xmm6
punpckhwd xmm0,xmm6 punpckhwd xmm0, xmm6
movdqa xmm1,xmm2 movdqa xmm1, xmm2
movdqa xmm6,xmm0 movdqa xmm6, xmm0
pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
paddd xmm2,xmm5 ; xmm2=tmp1L paddd xmm2, xmm5 ; xmm2=tmp1L
paddd xmm0,xmm7 ; xmm0=tmp1H paddd xmm0, xmm7 ; xmm0=tmp1H
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
@@ -340,57 +340,57 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
movdqa xmm2,xmm5 movdqa xmm2, xmm5
movdqa xmm0,xmm7 movdqa xmm0, xmm7
paddd xmm5,xmm3 ; xmm5=data0L paddd xmm5, xmm3 ; xmm5=data0L
paddd xmm7,xmm4 ; xmm7=data0H paddd xmm7, xmm4 ; xmm7=data0H
psubd xmm2,xmm3 ; xmm2=data7L psubd xmm2, xmm3 ; xmm2=data7L
psubd xmm0,xmm4 ; xmm0=data7H psubd xmm0, xmm4 ; xmm0=data7H
movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
paddd xmm5,xmm3 paddd xmm5, xmm3
paddd xmm7,xmm3 paddd xmm7, xmm3
psrad xmm5,DESCALE_P1 psrad xmm5, DESCALE_P1
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
paddd xmm2,xmm3 paddd xmm2, xmm3
paddd xmm0,xmm3 paddd xmm0, xmm3
psrad xmm2,DESCALE_P1 psrad xmm2, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
movdqa xmm7,xmm4 movdqa xmm7, xmm4
movdqa xmm0,xmm3 movdqa xmm0, xmm3
paddd xmm4,xmm1 ; xmm4=data1L paddd xmm4, xmm1 ; xmm4=data1L
paddd xmm3,xmm6 ; xmm3=data1H paddd xmm3, xmm6 ; xmm3=data1H
psubd xmm7,xmm1 ; xmm7=data6L psubd xmm7, xmm1 ; xmm7=data6L
psubd xmm0,xmm6 ; xmm0=data6H psubd xmm0, xmm6 ; xmm0=data6H
movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
paddd xmm4,xmm1 paddd xmm4, xmm1
paddd xmm3,xmm1 paddd xmm3, xmm1
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
psrad xmm3,DESCALE_P1 psrad xmm3, DESCALE_P1
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm0,xmm1 paddd xmm0, xmm1
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1) movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm1,xmm7 ; transpose coefficients(phase 1) movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
@@ -402,69 +402,69 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
movdqa xmm5,xmm3 movdqa xmm5, xmm3
movdqa xmm6,xmm0 movdqa xmm6, xmm0
paddd xmm3,xmm4 ; xmm3=data2L paddd xmm3, xmm4 ; xmm3=data2L
paddd xmm0,xmm2 ; xmm0=data2H paddd xmm0, xmm2 ; xmm0=data2H
psubd xmm5,xmm4 ; xmm5=data5L psubd xmm5, xmm4 ; xmm5=data5L
psubd xmm6,xmm2 ; xmm6=data5H psubd xmm6, xmm2 ; xmm6=data5H
movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
paddd xmm3,xmm7 paddd xmm3, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm3,DESCALE_P1 psrad xmm3, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
paddd xmm5,xmm7 paddd xmm5, xmm7
paddd xmm6,xmm7 paddd xmm6, xmm7
psrad xmm5,DESCALE_P1 psrad xmm5, DESCALE_P1
psrad xmm6,DESCALE_P1 psrad xmm6, DESCALE_P1
packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
movdqa xmm0,xmm1 movdqa xmm0, xmm1
movdqa xmm6,xmm4 movdqa xmm6, xmm4
paddd xmm1,xmm2 ; xmm1=data3L paddd xmm1, xmm2 ; xmm1=data3L
paddd xmm4,xmm7 ; xmm4=data3H paddd xmm4, xmm7 ; xmm4=data3H
psubd xmm0,xmm2 ; xmm0=data4L psubd xmm0, xmm2 ; xmm0=data4L
psubd xmm6,xmm7 ; xmm6=data4H psubd xmm6, xmm7 ; xmm6=data4H
movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrad xmm1,DESCALE_P1 psrad xmm1, DESCALE_P1
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
paddd xmm0,xmm2 paddd xmm0, xmm2
paddd xmm6,xmm2 paddd xmm6, xmm2
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
psrad xmm6,DESCALE_P1 psrad xmm6, DESCALE_P1
packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
movdqa xmm4,xmm3 ; transpose coefficients(phase 1) movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm6,xmm0 ; transpose coefficients(phase 1) movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
movdqa xmm1,xmm7 ; transpose coefficients(phase 2) movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
movdqa xmm5,xmm2 ; transpose coefficients(phase 2) movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
@@ -472,19 +472,19 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
movdqa xmm2,xmm0 ; transpose coefficients(phase 2) movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2) movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
movdqa xmm3,xmm7 ; transpose coefficients(phase 3) movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
movdqa xmm4,xmm1 ; transpose coefficients(phase 3) movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
@@ -492,12 +492,12 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
movdqa xmm3,xmm0 ; transpose coefficients(phase 3) movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
movdqa xmm4,xmm2 ; transpose coefficients(phase 3) movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
@@ -529,53 +529,53 @@ EXTN(jsimd_idct_islow_sse2):
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
movdqa xmm6,xmm1 ; xmm1=in2=z2 movdqa xmm6, xmm1 ; xmm1=in2=z2
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklwd xmm6,xmm2 ; xmm2=in6=z3 punpcklwd xmm6, xmm2 ; xmm2=in6=z3
punpckhwd xmm5,xmm2 punpckhwd xmm5, xmm2
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm2,xmm5 movdqa xmm2, xmm5
pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
movdqa xmm3,xmm7 movdqa xmm3, xmm7
paddw xmm7,xmm0 ; xmm7=in0+in4 paddw xmm7, xmm0 ; xmm7=in0+in4
psubw xmm3,xmm0 ; xmm3=in0-in4 psubw xmm3, xmm0 ; xmm3=in0-in4
pxor xmm4,xmm4 pxor xmm4, xmm4
pxor xmm0,xmm0 pxor xmm0, xmm0
punpcklwd xmm4,xmm7 ; xmm4=tmp0L punpcklwd xmm4, xmm7 ; xmm4=tmp0L
punpckhwd xmm0,xmm7 ; xmm0=tmp0H punpckhwd xmm0, xmm7 ; xmm0=tmp0H
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm6 ; xmm4=tmp10L paddd xmm4, xmm6 ; xmm4=tmp10L
psubd xmm7,xmm6 ; xmm7=tmp13L psubd xmm7, xmm6 ; xmm7=tmp13L
movdqa xmm6,xmm0 movdqa xmm6, xmm0
paddd xmm0,xmm5 ; xmm0=tmp10H paddd xmm0, xmm5 ; xmm0=tmp10H
psubd xmm6,xmm5 ; xmm6=tmp13H psubd xmm6, xmm5 ; xmm6=tmp13H
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
pxor xmm5,xmm5 pxor xmm5, xmm5
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm5,xmm3 ; xmm5=tmp1L punpcklwd xmm5, xmm3 ; xmm5=tmp1L
punpckhwd xmm4,xmm3 ; xmm4=tmp1H punpckhwd xmm4, xmm3 ; xmm4=tmp1H
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
movdqa xmm0,xmm5 movdqa xmm0, xmm5
paddd xmm5,xmm1 ; xmm5=tmp11L paddd xmm5, xmm1 ; xmm5=tmp11L
psubd xmm0,xmm1 ; xmm0=tmp12L psubd xmm0, xmm1 ; xmm0=tmp12L
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm2 ; xmm4=tmp11H paddd xmm4, xmm2 ; xmm4=tmp11H
psubd xmm7,xmm2 ; xmm7=tmp12H psubd xmm7, xmm2 ; xmm7=tmp12H
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
@@ -589,10 +589,10 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm4,xmm3 movdqa xmm4, xmm3
paddw xmm5,xmm1 ; xmm5=z3 paddw xmm5, xmm1 ; xmm5=z3
paddw xmm4,xmm2 ; xmm4=z4 paddw xmm4, xmm2 ; xmm4=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -603,16 +603,16 @@ EXTN(jsimd_idct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm0,xmm5 movdqa xmm0, xmm5
movdqa xmm7,xmm5 movdqa xmm7, xmm5
punpcklwd xmm0,xmm4 punpcklwd xmm0, xmm4
punpckhwd xmm7,xmm4 punpckhwd xmm7, xmm4
movdqa xmm5,xmm0 movdqa xmm5, xmm0
movdqa xmm4,xmm7 movdqa xmm4, xmm7
pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
@@ -633,38 +633,38 @@ EXTN(jsimd_idct_islow_sse2):
; tmp0 += z3; tmp1 += z4; ; tmp0 += z3; tmp1 += z4;
; tmp2 += z3; tmp3 += z4; ; tmp2 += z3; tmp3 += z4;
movdqa xmm0,xmm1 movdqa xmm0, xmm1
movdqa xmm7,xmm1 movdqa xmm7, xmm1
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
movdqa xmm1,xmm0 movdqa xmm1, xmm0
movdqa xmm3,xmm7 movdqa xmm3, xmm7
pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
paddd xmm1,xmm5 ; xmm1=tmp3L paddd xmm1, xmm5 ; xmm1=tmp3L
paddd xmm3,xmm4 ; xmm3=tmp3H paddd xmm3, xmm4 ; xmm3=tmp3H
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
movdqa xmm0,xmm2 movdqa xmm0, xmm2
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm0,xmm6 punpcklwd xmm0, xmm6
punpckhwd xmm7,xmm6 punpckhwd xmm7, xmm6
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm6,xmm7 movdqa xmm6, xmm7
pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
paddd xmm0,xmm5 ; xmm0=tmp1L paddd xmm0, xmm5 ; xmm0=tmp1L
paddd xmm7,xmm4 ; xmm7=tmp1H paddd xmm7, xmm4 ; xmm7=tmp1H
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
@@ -676,53 +676,53 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
movdqa xmm0,xmm5 movdqa xmm0, xmm5
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm5,xmm1 ; xmm5=data0L paddd xmm5, xmm1 ; xmm5=data0L
paddd xmm4,xmm3 ; xmm4=data0H paddd xmm4, xmm3 ; xmm4=data0H
psubd xmm0,xmm1 ; xmm0=data7L psubd xmm0, xmm1 ; xmm0=data7L
psubd xmm7,xmm3 ; xmm7=data7H psubd xmm7, xmm3 ; xmm7=data7H
movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm4,xmm1 paddd xmm4, xmm1
psrad xmm5,DESCALE_P2 psrad xmm5, DESCALE_P2
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
paddd xmm0,xmm1 paddd xmm0, xmm1
paddd xmm7,xmm1 paddd xmm7, xmm1
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
psrad xmm7,DESCALE_P2 psrad xmm7, DESCALE_P2
packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm7,xmm1 movdqa xmm7, xmm1
paddd xmm3,xmm2 ; xmm3=data1L paddd xmm3, xmm2 ; xmm3=data1L
paddd xmm1,xmm6 ; xmm1=data1H paddd xmm1, xmm6 ; xmm1=data1H
psubd xmm4,xmm2 ; xmm4=data6L psubd xmm4, xmm2 ; xmm4=data6L
psubd xmm7,xmm6 ; xmm7=data6H psubd xmm7, xmm6 ; xmm7=data6H
movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
paddd xmm3,xmm2 paddd xmm3, xmm2
paddd xmm1,xmm2 paddd xmm1, xmm2
psrad xmm3,DESCALE_P2 psrad xmm3, DESCALE_P2
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
paddd xmm4,xmm2 paddd xmm4, xmm2
paddd xmm7,xmm2 paddd xmm7, xmm2
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm7,DESCALE_P2 psrad xmm7, DESCALE_P2
packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
@@ -732,91 +732,91 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm4,xmm6 movdqa xmm4, xmm6
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddd xmm6,xmm1 ; xmm6=data2L paddd xmm6, xmm1 ; xmm6=data2L
paddd xmm2,xmm7 ; xmm2=data2H paddd xmm2, xmm7 ; xmm2=data2H
psubd xmm4,xmm1 ; xmm4=data5L psubd xmm4, xmm1 ; xmm4=data5L
psubd xmm0,xmm7 ; xmm0=data5H psubd xmm0, xmm7 ; xmm0=data5H
movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
paddd xmm6,xmm5 paddd xmm6, xmm5
paddd xmm2,xmm5 paddd xmm2, xmm5
psrad xmm6,DESCALE_P2 psrad xmm6, DESCALE_P2
psrad xmm2,DESCALE_P2 psrad xmm2, DESCALE_P2
paddd xmm4,xmm5 paddd xmm4, xmm5
paddd xmm0,xmm5 paddd xmm0, xmm5
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
movdqa xmm2,xmm3 movdqa xmm2, xmm3
movdqa xmm0,xmm1 movdqa xmm0, xmm1
paddd xmm3,xmm7 ; xmm3=data3L paddd xmm3, xmm7 ; xmm3=data3L
paddd xmm1,xmm5 ; xmm1=data3H paddd xmm1, xmm5 ; xmm1=data3H
psubd xmm2,xmm7 ; xmm2=data4L psubd xmm2, xmm7 ; xmm2=data4L
psubd xmm0,xmm5 ; xmm0=data4H psubd xmm0, xmm5 ; xmm0=data4H
movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
paddd xmm3,xmm7 paddd xmm3, xmm7
paddd xmm1,xmm7 paddd xmm1, xmm7
psrad xmm3,DESCALE_P2 psrad xmm3, DESCALE_P2
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
paddd xmm2,xmm7 paddd xmm2, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm2,DESCALE_P2 psrad xmm2, DESCALE_P2
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm7,xmm5 paddb xmm7, xmm5
paddb xmm1,xmm5 paddb xmm1, xmm5
paddb xmm6,xmm5 paddb xmm6, xmm5
paddb xmm3,xmm5 paddb xmm3, xmm5
movdqa xmm0,xmm7 ; transpose coefficients(phase 1) movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm7 ; transpose coefficients(phase 2) movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm5,xmm2 ; transpose coefficients(phase 2) movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm3,xmm4 ; transpose coefficients(phase 3) movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
@@ -837,7 +837,7 @@ EXTN(jsimd_idct_islow_sse2):
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret

View File

@@ -47,18 +47,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x,n) adds 1<<(n-1) (i.e. half of 2^n) to x and then shifts the sum
; right by n bits, which divides x by 2^n and rounds to the nearest integer.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
; Each integer literal below is the constant named in its FIX(...) comment
; multiplied by 2^30 (e.g. 320652955 ~= 0.298631336 * 2^30).  Descaling it by
; (30-CONST_BITS) bits therefore leaves a fixed-point value with CONST_BITS
; fractional bits.
; NOTE(review): CONST_BITS is defined outside this excerpt -- confirm its value
; and that 30-CONST_BITS is always >= 1, since DESCALE computes 1<<((n)-1).
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -108,11 +108,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_islow_sse2): EXTN(jsimd_idct_islow_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
@@ -140,11 +140,11 @@ EXTN(jsimd_idct_islow_sse2):
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm1,xmm0 por xmm1, xmm0
packsswb xmm1,xmm1 packsswb xmm1, xmm1
packsswb xmm1,xmm1 packsswb xmm1, xmm1
movd eax,xmm1 movd eax, xmm1
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -152,27 +152,27 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm5,PASS1_BITS psllw xmm5, PASS1_BITS
movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
jmp near .column_end jmp near .column_end
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -196,53 +196,53 @@ EXTN(jsimd_idct_islow_sse2):
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
movdqa xmm4,xmm1 ; xmm1=in2=z2 movdqa xmm4, xmm1 ; xmm1=in2=z2
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklwd xmm4,xmm3 ; xmm3=in6=z3 punpcklwd xmm4, xmm3 ; xmm3=in6=z3
punpckhwd xmm5,xmm3 punpckhwd xmm5, xmm3
movdqa xmm1,xmm4 movdqa xmm1, xmm4
movdqa xmm3,xmm5 movdqa xmm3, xmm5
pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
movdqa xmm6,xmm0 movdqa xmm6, xmm0
paddw xmm0,xmm2 ; xmm0=in0+in4 paddw xmm0, xmm2 ; xmm0=in0+in4
psubw xmm6,xmm2 ; xmm6=in0-in4 psubw xmm6, xmm2 ; xmm6=in0-in4
pxor xmm7,xmm7 pxor xmm7, xmm7
pxor xmm2,xmm2 pxor xmm2, xmm2
punpcklwd xmm7,xmm0 ; xmm7=tmp0L punpcklwd xmm7, xmm0 ; xmm7=tmp0L
punpckhwd xmm2,xmm0 ; xmm2=tmp0H punpckhwd xmm2, xmm0 ; xmm2=tmp0H
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
movdqa xmm0,xmm7 movdqa xmm0, xmm7
paddd xmm7,xmm4 ; xmm7=tmp10L paddd xmm7, xmm4 ; xmm7=tmp10L
psubd xmm0,xmm4 ; xmm0=tmp13L psubd xmm0, xmm4 ; xmm0=tmp13L
movdqa xmm4,xmm2 movdqa xmm4, xmm2
paddd xmm2,xmm5 ; xmm2=tmp10H paddd xmm2, xmm5 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp13H psubd xmm4, xmm5 ; xmm4=tmp13H
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
pxor xmm5,xmm5 pxor xmm5, xmm5
pxor xmm7,xmm7 pxor xmm7, xmm7
punpcklwd xmm5,xmm6 ; xmm5=tmp1L punpcklwd xmm5, xmm6 ; xmm5=tmp1L
punpckhwd xmm7,xmm6 ; xmm7=tmp1H punpckhwd xmm7, xmm6 ; xmm7=tmp1H
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
movdqa xmm2,xmm5 movdqa xmm2, xmm5
paddd xmm5,xmm1 ; xmm5=tmp11L paddd xmm5, xmm1 ; xmm5=tmp11L
psubd xmm2,xmm1 ; xmm2=tmp12L psubd xmm2, xmm1 ; xmm2=tmp12L
movdqa xmm0,xmm7 movdqa xmm0, xmm7
paddd xmm7,xmm3 ; xmm7=tmp11H paddd xmm7, xmm3 ; xmm7=tmp11H
psubd xmm0,xmm3 ; xmm0=tmp12H psubd xmm0, xmm3 ; xmm0=tmp12H
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
@@ -260,10 +260,10 @@ EXTN(jsimd_idct_islow_sse2):
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddw xmm5,xmm3 ; xmm5=z3 paddw xmm5, xmm3 ; xmm5=z3
paddw xmm7,xmm1 ; xmm7=z4 paddw xmm7, xmm1 ; xmm7=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -274,16 +274,16 @@ EXTN(jsimd_idct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm2,xmm5 movdqa xmm2, xmm5
movdqa xmm0,xmm5 movdqa xmm0, xmm5
punpcklwd xmm2,xmm7 punpcklwd xmm2, xmm7
punpckhwd xmm0,xmm7 punpckhwd xmm0, xmm7
movdqa xmm5,xmm2 movdqa xmm5, xmm2
movdqa xmm7,xmm0 movdqa xmm7, xmm0
pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
@@ -304,38 +304,38 @@ EXTN(jsimd_idct_islow_sse2):
; tmp0 += z3; tmp1 += z4; ; tmp0 += z3; tmp1 += z4;
; tmp2 += z3; tmp3 += z4; ; tmp2 += z3; tmp3 += z4;
movdqa xmm2,xmm3 movdqa xmm2, xmm3
movdqa xmm0,xmm3 movdqa xmm0, xmm3
punpcklwd xmm2,xmm4 punpcklwd xmm2, xmm4
punpckhwd xmm0,xmm4 punpckhwd xmm0, xmm4
movdqa xmm3,xmm2 movdqa xmm3, xmm2
movdqa xmm4,xmm0 movdqa xmm4, xmm0
pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
paddd xmm3,xmm5 ; xmm3=tmp3L paddd xmm3, xmm5 ; xmm3=tmp3L
paddd xmm4,xmm7 ; xmm4=tmp3H paddd xmm4, xmm7 ; xmm4=tmp3H
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
movdqa xmm2,xmm1 movdqa xmm2, xmm1
movdqa xmm0,xmm1 movdqa xmm0, xmm1
punpcklwd xmm2,xmm6 punpcklwd xmm2, xmm6
punpckhwd xmm0,xmm6 punpckhwd xmm0, xmm6
movdqa xmm1,xmm2 movdqa xmm1, xmm2
movdqa xmm6,xmm0 movdqa xmm6, xmm0
pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
paddd xmm2,xmm5 ; xmm2=tmp1L paddd xmm2, xmm5 ; xmm2=tmp1L
paddd xmm0,xmm7 ; xmm0=tmp1H paddd xmm0, xmm7 ; xmm0=tmp1H
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
@@ -347,57 +347,57 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
movdqa xmm2,xmm5 movdqa xmm2, xmm5
movdqa xmm0,xmm7 movdqa xmm0, xmm7
paddd xmm5,xmm3 ; xmm5=data0L paddd xmm5, xmm3 ; xmm5=data0L
paddd xmm7,xmm4 ; xmm7=data0H paddd xmm7, xmm4 ; xmm7=data0H
psubd xmm2,xmm3 ; xmm2=data7L psubd xmm2, xmm3 ; xmm2=data7L
psubd xmm0,xmm4 ; xmm0=data7H psubd xmm0, xmm4 ; xmm0=data7H
movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
paddd xmm5,xmm3 paddd xmm5, xmm3
paddd xmm7,xmm3 paddd xmm7, xmm3
psrad xmm5,DESCALE_P1 psrad xmm5, DESCALE_P1
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
paddd xmm2,xmm3 paddd xmm2, xmm3
paddd xmm0,xmm3 paddd xmm0, xmm3
psrad xmm2,DESCALE_P1 psrad xmm2, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
movdqa xmm7,xmm4 movdqa xmm7, xmm4
movdqa xmm0,xmm3 movdqa xmm0, xmm3
paddd xmm4,xmm1 ; xmm4=data1L paddd xmm4, xmm1 ; xmm4=data1L
paddd xmm3,xmm6 ; xmm3=data1H paddd xmm3, xmm6 ; xmm3=data1H
psubd xmm7,xmm1 ; xmm7=data6L psubd xmm7, xmm1 ; xmm7=data6L
psubd xmm0,xmm6 ; xmm0=data6H psubd xmm0, xmm6 ; xmm0=data6H
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
paddd xmm4,xmm1 paddd xmm4, xmm1
paddd xmm3,xmm1 paddd xmm3, xmm1
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
psrad xmm3,DESCALE_P1 psrad xmm3, DESCALE_P1
paddd xmm7,xmm1 paddd xmm7, xmm1
paddd xmm0,xmm1 paddd xmm0, xmm1
psrad xmm7,DESCALE_P1 psrad xmm7, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
movdqa xmm6,xmm5 ; transpose coefficients(phase 1) movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm1,xmm7 ; transpose coefficients(phase 1) movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
@@ -409,69 +409,69 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
movdqa xmm5,xmm3 movdqa xmm5, xmm3
movdqa xmm6,xmm0 movdqa xmm6, xmm0
paddd xmm3,xmm4 ; xmm3=data2L paddd xmm3, xmm4 ; xmm3=data2L
paddd xmm0,xmm2 ; xmm0=data2H paddd xmm0, xmm2 ; xmm0=data2H
psubd xmm5,xmm4 ; xmm5=data5L psubd xmm5, xmm4 ; xmm5=data5L
psubd xmm6,xmm2 ; xmm6=data5H psubd xmm6, xmm2 ; xmm6=data5H
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
paddd xmm3,xmm7 paddd xmm3, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm3,DESCALE_P1 psrad xmm3, DESCALE_P1
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
paddd xmm5,xmm7 paddd xmm5, xmm7
paddd xmm6,xmm7 paddd xmm6, xmm7
psrad xmm5,DESCALE_P1 psrad xmm5, DESCALE_P1
psrad xmm6,DESCALE_P1 psrad xmm6, DESCALE_P1
packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
movdqa xmm0,xmm1 movdqa xmm0, xmm1
movdqa xmm6,xmm4 movdqa xmm6, xmm4
paddd xmm1,xmm2 ; xmm1=data3L paddd xmm1, xmm2 ; xmm1=data3L
paddd xmm4,xmm7 ; xmm4=data3H paddd xmm4, xmm7 ; xmm4=data3H
psubd xmm0,xmm2 ; xmm0=data4L psubd xmm0, xmm2 ; xmm0=data4L
psubd xmm6,xmm7 ; xmm6=data4H psubd xmm6, xmm7 ; xmm6=data4H
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm4,xmm2 paddd xmm4, xmm2
psrad xmm1,DESCALE_P1 psrad xmm1, DESCALE_P1
psrad xmm4,DESCALE_P1 psrad xmm4, DESCALE_P1
paddd xmm0,xmm2 paddd xmm0, xmm2
paddd xmm6,xmm2 paddd xmm6, xmm2
psrad xmm0,DESCALE_P1 psrad xmm0, DESCALE_P1
psrad xmm6,DESCALE_P1 psrad xmm6, DESCALE_P1
packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
movdqa xmm4,xmm3 ; transpose coefficients(phase 1) movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
movdqa xmm6,xmm0 ; transpose coefficients(phase 1) movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
movdqa xmm1,xmm7 ; transpose coefficients(phase 2) movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
movdqa xmm5,xmm2 ; transpose coefficients(phase 2) movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
@@ -479,19 +479,19 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
movdqa xmm2,xmm0 ; transpose coefficients(phase 2) movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5,xmm6 ; transpose coefficients(phase 2) movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
movdqa xmm3,xmm7 ; transpose coefficients(phase 3) movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
movdqa xmm4,xmm1 ; transpose coefficients(phase 3) movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
@@ -499,12 +499,12 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
movdqa xmm3,xmm0 ; transpose coefficients(phase 3) movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
movdqa xmm4,xmm2 ; transpose coefficients(phase 3) movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
@@ -536,53 +536,53 @@ EXTN(jsimd_idct_islow_sse2):
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
movdqa xmm6,xmm1 ; xmm1=in2=z2 movdqa xmm6, xmm1 ; xmm1=in2=z2
movdqa xmm5,xmm1 movdqa xmm5, xmm1
punpcklwd xmm6,xmm2 ; xmm2=in6=z3 punpcklwd xmm6, xmm2 ; xmm2=in6=z3
punpckhwd xmm5,xmm2 punpckhwd xmm5, xmm2
movdqa xmm1,xmm6 movdqa xmm1, xmm6
movdqa xmm2,xmm5 movdqa xmm2, xmm5
pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
movdqa xmm3,xmm7 movdqa xmm3, xmm7
paddw xmm7,xmm0 ; xmm7=in0+in4 paddw xmm7, xmm0 ; xmm7=in0+in4
psubw xmm3,xmm0 ; xmm3=in0-in4 psubw xmm3, xmm0 ; xmm3=in0-in4
pxor xmm4,xmm4 pxor xmm4, xmm4
pxor xmm0,xmm0 pxor xmm0, xmm0
punpcklwd xmm4,xmm7 ; xmm4=tmp0L punpcklwd xmm4, xmm7 ; xmm4=tmp0L
punpckhwd xmm0,xmm7 ; xmm0=tmp0H punpckhwd xmm0, xmm7 ; xmm0=tmp0H
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm6 ; xmm4=tmp10L paddd xmm4, xmm6 ; xmm4=tmp10L
psubd xmm7,xmm6 ; xmm7=tmp13L psubd xmm7, xmm6 ; xmm7=tmp13L
movdqa xmm6,xmm0 movdqa xmm6, xmm0
paddd xmm0,xmm5 ; xmm0=tmp10H paddd xmm0, xmm5 ; xmm0=tmp10H
psubd xmm6,xmm5 ; xmm6=tmp13H psubd xmm6, xmm5 ; xmm6=tmp13H
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
pxor xmm5,xmm5 pxor xmm5, xmm5
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm5,xmm3 ; xmm5=tmp1L punpcklwd xmm5, xmm3 ; xmm5=tmp1L
punpckhwd xmm4,xmm3 ; xmm4=tmp1H punpckhwd xmm4, xmm3 ; xmm4=tmp1H
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
movdqa xmm0,xmm5 movdqa xmm0, xmm5
paddd xmm5,xmm1 ; xmm5=tmp11L paddd xmm5, xmm1 ; xmm5=tmp11L
psubd xmm0,xmm1 ; xmm0=tmp12L psubd xmm0, xmm1 ; xmm0=tmp12L
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm2 ; xmm4=tmp11H paddd xmm4, xmm2 ; xmm4=tmp11H
psubd xmm7,xmm2 ; xmm7=tmp12H psubd xmm7, xmm2 ; xmm7=tmp12H
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
@@ -596,10 +596,10 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
movdqa xmm5,xmm6 movdqa xmm5, xmm6
movdqa xmm4,xmm3 movdqa xmm4, xmm3
paddw xmm5,xmm1 ; xmm5=z3 paddw xmm5, xmm1 ; xmm5=z3
paddw xmm4,xmm2 ; xmm4=z4 paddw xmm4, xmm2 ; xmm4=z4
; (Original) ; (Original)
; z5 = (z3 + z4) * 1.175875602; ; z5 = (z3 + z4) * 1.175875602;
@@ -610,16 +610,16 @@ EXTN(jsimd_idct_islow_sse2):
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm0,xmm5 movdqa xmm0, xmm5
movdqa xmm7,xmm5 movdqa xmm7, xmm5
punpcklwd xmm0,xmm4 punpcklwd xmm0, xmm4
punpckhwd xmm7,xmm4 punpckhwd xmm7, xmm4
movdqa xmm5,xmm0 movdqa xmm5, xmm0
movdqa xmm4,xmm7 movdqa xmm4, xmm7
pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
@@ -640,38 +640,38 @@ EXTN(jsimd_idct_islow_sse2):
; tmp0 += z3; tmp1 += z4; ; tmp0 += z3; tmp1 += z4;
; tmp2 += z3; tmp3 += z4; ; tmp2 += z3; tmp3 += z4;
movdqa xmm0,xmm1 movdqa xmm0, xmm1
movdqa xmm7,xmm1 movdqa xmm7, xmm1
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
movdqa xmm1,xmm0 movdqa xmm1, xmm0
movdqa xmm3,xmm7 movdqa xmm3, xmm7
pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
paddd xmm1,xmm5 ; xmm1=tmp3L paddd xmm1, xmm5 ; xmm1=tmp3L
paddd xmm3,xmm4 ; xmm3=tmp3H paddd xmm3, xmm4 ; xmm3=tmp3H
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
movdqa xmm0,xmm2 movdqa xmm0, xmm2
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm0,xmm6 punpcklwd xmm0, xmm6
punpckhwd xmm7,xmm6 punpckhwd xmm7, xmm6
movdqa xmm2,xmm0 movdqa xmm2, xmm0
movdqa xmm6,xmm7 movdqa xmm6, xmm7
pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
paddd xmm0,xmm5 ; xmm0=tmp1L paddd xmm0, xmm5 ; xmm0=tmp1L
paddd xmm7,xmm4 ; xmm7=tmp1H paddd xmm7, xmm4 ; xmm7=tmp1H
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
@@ -683,53 +683,53 @@ EXTN(jsimd_idct_islow_sse2):
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
movdqa xmm0,xmm5 movdqa xmm0, xmm5
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm5,xmm1 ; xmm5=data0L paddd xmm5, xmm1 ; xmm5=data0L
paddd xmm4,xmm3 ; xmm4=data0H paddd xmm4, xmm3 ; xmm4=data0H
psubd xmm0,xmm1 ; xmm0=data7L psubd xmm0, xmm1 ; xmm0=data7L
psubd xmm7,xmm3 ; xmm7=data7H psubd xmm7, xmm3 ; xmm7=data7H
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm4,xmm1 paddd xmm4, xmm1
psrad xmm5,DESCALE_P2 psrad xmm5, DESCALE_P2
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
paddd xmm0,xmm1 paddd xmm0, xmm1
paddd xmm7,xmm1 paddd xmm7, xmm1
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
psrad xmm7,DESCALE_P2 psrad xmm7, DESCALE_P2
packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
movdqa xmm4,xmm3 movdqa xmm4, xmm3
movdqa xmm7,xmm1 movdqa xmm7, xmm1
paddd xmm3,xmm2 ; xmm3=data1L paddd xmm3, xmm2 ; xmm3=data1L
paddd xmm1,xmm6 ; xmm1=data1H paddd xmm1, xmm6 ; xmm1=data1H
psubd xmm4,xmm2 ; xmm4=data6L psubd xmm4, xmm2 ; xmm4=data6L
psubd xmm7,xmm6 ; xmm7=data6H psubd xmm7, xmm6 ; xmm7=data6H
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
paddd xmm3,xmm2 paddd xmm3, xmm2
paddd xmm1,xmm2 paddd xmm1, xmm2
psrad xmm3,DESCALE_P2 psrad xmm3, DESCALE_P2
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
paddd xmm4,xmm2 paddd xmm4, xmm2
paddd xmm7,xmm2 paddd xmm7, xmm2
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm7,DESCALE_P2 psrad xmm7, DESCALE_P2
packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
@@ -739,91 +739,91 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
movdqa xmm4,xmm6 movdqa xmm4, xmm6
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddd xmm6,xmm1 ; xmm6=data2L paddd xmm6, xmm1 ; xmm6=data2L
paddd xmm2,xmm7 ; xmm2=data2H paddd xmm2, xmm7 ; xmm2=data2H
psubd xmm4,xmm1 ; xmm4=data5L psubd xmm4, xmm1 ; xmm4=data5L
psubd xmm0,xmm7 ; xmm0=data5H psubd xmm0, xmm7 ; xmm0=data5H
movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
paddd xmm6,xmm5 paddd xmm6, xmm5
paddd xmm2,xmm5 paddd xmm2, xmm5
psrad xmm6,DESCALE_P2 psrad xmm6, DESCALE_P2
psrad xmm2,DESCALE_P2 psrad xmm2, DESCALE_P2
paddd xmm4,xmm5 paddd xmm4, xmm5
paddd xmm0,xmm5 paddd xmm0, xmm5
psrad xmm4,DESCALE_P2 psrad xmm4, DESCALE_P2
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
movdqa xmm2,xmm3 movdqa xmm2, xmm3
movdqa xmm0,xmm1 movdqa xmm0, xmm1
paddd xmm3,xmm7 ; xmm3=data3L paddd xmm3, xmm7 ; xmm3=data3L
paddd xmm1,xmm5 ; xmm1=data3H paddd xmm1, xmm5 ; xmm1=data3H
psubd xmm2,xmm7 ; xmm2=data4L psubd xmm2, xmm7 ; xmm2=data4L
psubd xmm0,xmm5 ; xmm0=data4H psubd xmm0, xmm5 ; xmm0=data4H
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
paddd xmm3,xmm7 paddd xmm3, xmm7
paddd xmm1,xmm7 paddd xmm1, xmm7
psrad xmm3,DESCALE_P2 psrad xmm3, DESCALE_P2
psrad xmm1,DESCALE_P2 psrad xmm1, DESCALE_P2
paddd xmm2,xmm7 paddd xmm2, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm2,DESCALE_P2 psrad xmm2, DESCALE_P2
psrad xmm0,DESCALE_P2 psrad xmm0, DESCALE_P2
movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
paddb xmm7,xmm5 paddb xmm7, xmm5
paddb xmm1,xmm5 paddb xmm1, xmm5
paddb xmm6,xmm5 paddb xmm6, xmm5
paddb xmm3,xmm5 paddb xmm3, xmm5
movdqa xmm0,xmm7 ; transpose coefficients(phase 1) movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
movdqa xmm2,xmm6 ; transpose coefficients(phase 1) movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
movdqa xmm4,xmm7 ; transpose coefficients(phase 2) movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
movdqa xmm5,xmm2 ; transpose coefficients(phase 2) movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
movdqa xmm1,xmm7 ; transpose coefficients(phase 3) movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
movdqa xmm3,xmm4 ; transpose coefficients(phase 3) movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
@@ -848,7 +848,7 @@ EXTN(jsimd_idct_islow_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret

View File

@@ -52,20 +52,20 @@ F_3_624 equ 29692 ; FIX(3.624509785)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -117,11 +117,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_4x4_sse2): EXTN(jsimd_idct_4x4_sse2):
push rbp push rbp
mov rax,rsp ; rax = original rbp mov rax, rsp ; rax = original rbp
sub rsp, byte 4 sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args
@@ -141,11 +141,11 @@ EXTN(jsimd_idct_4x4_sse2):
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
por xmm0,xmm1 por xmm0, xmm1
packsswb xmm0,xmm0 packsswb xmm0, xmm0
packsswb xmm0,xmm0 packsswb xmm0, xmm0
movd eax,xmm0 movd eax, xmm0
test rax,rax test rax, rax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -153,16 +153,16 @@ EXTN(jsimd_idct_4x4_sse2):
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm0,PASS1_BITS psllw xmm0, PASS1_BITS
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
jmp near .column_end jmp near .column_end
%endif %endif
@@ -179,32 +179,32 @@ EXTN(jsimd_idct_4x4_sse2):
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm0 movdqa xmm5, xmm0
punpcklwd xmm4,xmm1 punpcklwd xmm4, xmm1
punpckhwd xmm5,xmm1 punpckhwd xmm5, xmm1
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm6,xmm3 punpcklwd xmm6, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
paddd xmm6,xmm4 ; xmm6=tmp2L paddd xmm6, xmm4 ; xmm6=tmp2L
paddd xmm7,xmm5 ; xmm7=tmp2H paddd xmm7, xmm5 ; xmm7=tmp2H
paddd xmm2,xmm0 ; xmm2=tmp0L paddd xmm2, xmm0 ; xmm2=tmp0L
paddd xmm3,xmm1 ; xmm3=tmp0H paddd xmm3, xmm1 ; xmm3=tmp0H
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
@@ -218,86 +218,86 @@ EXTN(jsimd_idct_4x4_sse2):
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm2,xmm2 pxor xmm2, xmm2
punpcklwd xmm1,xmm4 ; xmm1=tmp0L punpcklwd xmm1, xmm4 ; xmm1=tmp0L
punpckhwd xmm2,xmm4 ; xmm2=tmp0H punpckhwd xmm2, xmm4 ; xmm2=tmp0H
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
movdqa xmm3,xmm5 ; xmm5=in2=z2 movdqa xmm3, xmm5 ; xmm5=in2=z2
punpcklwd xmm5,xmm0 ; xmm0=in6=z3 punpcklwd xmm5, xmm0 ; xmm0=in6=z3
punpckhwd xmm3,xmm0 punpckhwd xmm3, xmm0
pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
movdqa xmm4,xmm1 movdqa xmm4, xmm1
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddd xmm1,xmm5 ; xmm1=tmp10L paddd xmm1, xmm5 ; xmm1=tmp10L
paddd xmm2,xmm3 ; xmm2=tmp10H paddd xmm2, xmm3 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp12L psubd xmm4, xmm5 ; xmm4=tmp12L
psubd xmm0,xmm3 ; xmm0=tmp12H psubd xmm0, xmm3 ; xmm0=tmp12H
; -- Final output stage ; -- Final output stage
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm3,xmm2 movdqa xmm3, xmm2
paddd xmm1,xmm6 ; xmm1=data0L paddd xmm1, xmm6 ; xmm1=data0L
paddd xmm2,xmm7 ; xmm2=data0H paddd xmm2, xmm7 ; xmm2=data0H
psubd xmm5,xmm6 ; xmm5=data3L psubd xmm5, xmm6 ; xmm5=data3L
psubd xmm3,xmm7 ; xmm3=data3H psubd xmm3, xmm7 ; xmm3=data3H
movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
paddd xmm1,xmm6 paddd xmm1, xmm6
paddd xmm2,xmm6 paddd xmm2, xmm6
psrad xmm1,DESCALE_P1_4 psrad xmm1, DESCALE_P1_4
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
paddd xmm5,xmm6 paddd xmm5, xmm6
paddd xmm3,xmm6 paddd xmm3, xmm6
psrad xmm5,DESCALE_P1_4 psrad xmm5, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm2,xmm4 movdqa xmm2, xmm4
movdqa xmm3,xmm0 movdqa xmm3, xmm0
paddd xmm4,xmm7 ; xmm4=data1L paddd xmm4, xmm7 ; xmm4=data1L
paddd xmm0,xmm6 ; xmm0=data1H paddd xmm0, xmm6 ; xmm0=data1H
psubd xmm2,xmm7 ; xmm2=data2L psubd xmm2, xmm7 ; xmm2=data2L
psubd xmm3,xmm6 ; xmm3=data2H psubd xmm3, xmm6 ; xmm3=data2H
movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
paddd xmm4,xmm7 paddd xmm4, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm4,DESCALE_P1_4 psrad xmm4, DESCALE_P1_4
psrad xmm0,DESCALE_P1_4 psrad xmm0, DESCALE_P1_4
paddd xmm2,xmm7 paddd xmm2, xmm7
paddd xmm3,xmm7 paddd xmm3, xmm7
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
movdqa xmm6,xmm1 ; transpose coefficients(phase 1) movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7,xmm2 ; transpose coefficients(phase 1) movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2) movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3,xmm6 ; transpose coefficients(phase 2) movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
@@ -315,70 +315,70 @@ EXTN(jsimd_idct_4x4_sse2):
; -- Even part ; -- Even part
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm4,xmm1 ; xmm4=tmp0 punpcklwd xmm4, xmm1 ; xmm4=tmp0
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
; -- Odd part ; -- Odd part
punpckhwd xmm1,xmm0 punpckhwd xmm1, xmm0
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm2,xmm6 movdqa xmm2, xmm6
pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
paddd xmm6,xmm1 ; xmm6=tmp2 paddd xmm6, xmm1 ; xmm6=tmp2
paddd xmm2,xmm5 ; xmm2=tmp0 paddd xmm2, xmm5 ; xmm2=tmp0
; -- Even part ; -- Even part
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm0 ; xmm4=tmp10 paddd xmm4, xmm0 ; xmm4=tmp10
psubd xmm7,xmm0 ; xmm7=tmp12 psubd xmm7, xmm0 ; xmm7=tmp12
; -- Final output stage ; -- Final output stage
movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
movdqa xmm5,xmm4 movdqa xmm5, xmm4
movdqa xmm3,xmm7 movdqa xmm3, xmm7
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
paddd xmm4,xmm1 paddd xmm4, xmm1
paddd xmm7,xmm1 paddd xmm7, xmm1
psrad xmm4,DESCALE_P2_4 psrad xmm4, DESCALE_P2_4
psrad xmm7,DESCALE_P2_4 psrad xmm7, DESCALE_P2_4
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm3,xmm1 paddd xmm3, xmm1
psrad xmm5,DESCALE_P2_4 psrad xmm5, DESCALE_P2_4
psrad xmm3,DESCALE_P2_4 psrad xmm3, DESCALE_P2_4
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
movdqa xmm0,xmm4 ; transpose coefficients(phase 1) movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm6,xmm4 ; transpose coefficients(phase 2) movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4,[rel PB_CENTERJSAMP] paddb xmm4, [rel PB_CENTERJSAMP]
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
@@ -390,7 +390,7 @@ EXTN(jsimd_idct_4x4_sse2):
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
uncollect_args uncollect_args
mov rsp,rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
ret ret
@@ -416,8 +416,8 @@ EXTN(jsimd_idct_4x4_sse2):
EXTN(jsimd_idct_2x2_sse2): EXTN(jsimd_idct_2x2_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
@@ -450,27 +450,27 @@ EXTN(jsimd_idct_2x2_sse2):
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
pcmpeqd xmm7,xmm7 pcmpeqd xmm7, xmm7
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4,[rel PW_F362_MF127] pmaddwd xmm4, [rel PW_F362_MF127]
pmaddwd xmm5,[rel PW_F085_MF072] pmaddwd xmm5, [rel PW_F085_MF072]
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0,[rel PW_F362_MF127] pmaddwd xmm0, [rel PW_F362_MF127]
pmaddwd xmm2,[rel PW_F085_MF072] pmaddwd xmm2, [rel PW_F085_MF072]
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
; -- Even part ; -- Even part
@@ -479,36 +479,36 @@ EXTN(jsimd_idct_2x2_sse2):
; xmm6=(00 01 ** 03 ** 05 ** 07) ; xmm6=(00 01 ** 03 ** 05 ** 07)
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
; -- Final output stage ; -- Final output stage
movdqa xmm3,xmm6 movdqa xmm3, xmm6
movdqa xmm5,xmm1 movdqa xmm5, xmm1
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
movdqa xmm7,xmm1 movdqa xmm7, xmm1
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
paddd xmm6,xmm2 paddd xmm6, xmm2
psrad xmm6,DESCALE_P1_2 psrad xmm6, DESCALE_P1_2
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm7,xmm2 paddd xmm7, xmm2
psrad xmm1,DESCALE_P1_2 psrad xmm1, DESCALE_P1_2
psrad xmm7,DESCALE_P1_2 psrad xmm7, DESCALE_P1_2
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
@@ -531,34 +531,34 @@ EXTN(jsimd_idct_2x2_sse2):
; -- Odd part ; -- Odd part
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1,[rel PW_F362_MF127] pmaddwd xmm1, [rel PW_F362_MF127]
pmaddwd xmm7,[rel PW_F085_MF072] pmaddwd xmm7, [rel PW_F085_MF072]
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
; -- Even part ; -- Even part
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
; -- Final output stage ; -- Final output stage
movdqa xmm4,xmm6 movdqa xmm4, xmm6
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
paddd xmm6,[rel PD_DESCALE_P2_2] paddd xmm6, [rel PD_DESCALE_P2_2]
psrad xmm6,DESCALE_P2_2 psrad xmm6, DESCALE_P2_2
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6,[rel PB_CENTERJSAMP] paddb xmm6, [rel PB_CENTERJSAMP]
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]

View File

@@ -51,20 +51,20 @@ F_3_624 equ 29692 ; FIX(3.624509785)
%else %else
; NASM cannot do compile-time arithmetic on floating-point constants. ; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif %endif
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
@@ -116,11 +116,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_4x4_sse2): EXTN(jsimd_idct_4x4_sse2):
push ebp push ebp
mov eax,esp ; eax = original ebp mov eax, esp ; eax = original ebp
sub esp, byte 4 sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax mov [esp], eax
mov ebp,esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx pushpic ebx
; push ecx ; unused ; push ecx ; unused
@@ -147,11 +147,11 @@ EXTN(jsimd_idct_4x4_sse2):
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
por xmm0,xmm1 por xmm0, xmm1
packsswb xmm0,xmm0 packsswb xmm0, xmm0
packsswb xmm0,xmm0 packsswb xmm0, xmm0
movd eax,xmm0 movd eax, xmm0
test eax,eax test eax, eax
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -159,19 +159,19 @@ EXTN(jsimd_idct_4x4_sse2):
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
psllw xmm0,PASS1_BITS psllw xmm0, PASS1_BITS
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
jmp near .column_end jmp near .column_end
alignx 16,7 alignx 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -186,32 +186,32 @@ EXTN(jsimd_idct_4x4_sse2):
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
movdqa xmm4,xmm0 movdqa xmm4, xmm0
movdqa xmm5,xmm0 movdqa xmm5, xmm0
punpcklwd xmm4,xmm1 punpcklwd xmm4, xmm1
punpckhwd xmm5,xmm1 punpckhwd xmm5, xmm1
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
movdqa xmm6,xmm2 movdqa xmm6, xmm2
movdqa xmm7,xmm2 movdqa xmm7, xmm2
punpcklwd xmm6,xmm3 punpcklwd xmm6, xmm3
punpckhwd xmm7,xmm3 punpckhwd xmm7, xmm3
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
paddd xmm6,xmm4 ; xmm6=tmp2L paddd xmm6, xmm4 ; xmm6=tmp2L
paddd xmm7,xmm5 ; xmm7=tmp2H paddd xmm7, xmm5 ; xmm7=tmp2H
paddd xmm2,xmm0 ; xmm2=tmp0L paddd xmm2, xmm0 ; xmm2=tmp0L
paddd xmm3,xmm1 ; xmm3=tmp0H paddd xmm3, xmm1 ; xmm3=tmp0H
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
@@ -225,86 +225,86 @@ EXTN(jsimd_idct_4x4_sse2):
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
pxor xmm1,xmm1 pxor xmm1, xmm1
pxor xmm2,xmm2 pxor xmm2, xmm2
punpcklwd xmm1,xmm4 ; xmm1=tmp0L punpcklwd xmm1, xmm4 ; xmm1=tmp0L
punpckhwd xmm2,xmm4 ; xmm2=tmp0H punpckhwd xmm2, xmm4 ; xmm2=tmp0H
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
movdqa xmm3,xmm5 ; xmm5=in2=z2 movdqa xmm3, xmm5 ; xmm5=in2=z2
punpcklwd xmm5,xmm0 ; xmm0=in6=z3 punpcklwd xmm5, xmm0 ; xmm0=in6=z3
punpckhwd xmm3,xmm0 punpckhwd xmm3, xmm0
pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
movdqa xmm4,xmm1 movdqa xmm4, xmm1
movdqa xmm0,xmm2 movdqa xmm0, xmm2
paddd xmm1,xmm5 ; xmm1=tmp10L paddd xmm1, xmm5 ; xmm1=tmp10L
paddd xmm2,xmm3 ; xmm2=tmp10H paddd xmm2, xmm3 ; xmm2=tmp10H
psubd xmm4,xmm5 ; xmm4=tmp12L psubd xmm4, xmm5 ; xmm4=tmp12L
psubd xmm0,xmm3 ; xmm0=tmp12H psubd xmm0, xmm3 ; xmm0=tmp12H
; -- Final output stage ; -- Final output stage
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm3,xmm2 movdqa xmm3, xmm2
paddd xmm1,xmm6 ; xmm1=data0L paddd xmm1, xmm6 ; xmm1=data0L
paddd xmm2,xmm7 ; xmm2=data0H paddd xmm2, xmm7 ; xmm2=data0H
psubd xmm5,xmm6 ; xmm5=data3L psubd xmm5, xmm6 ; xmm5=data3L
psubd xmm3,xmm7 ; xmm3=data3H psubd xmm3, xmm7 ; xmm3=data3H
movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
paddd xmm1,xmm6 paddd xmm1, xmm6
paddd xmm2,xmm6 paddd xmm2, xmm6
psrad xmm1,DESCALE_P1_4 psrad xmm1, DESCALE_P1_4
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
paddd xmm5,xmm6 paddd xmm5, xmm6
paddd xmm3,xmm6 paddd xmm3, xmm6
psrad xmm5,DESCALE_P1_4 psrad xmm5, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
movdqa xmm2,xmm4 movdqa xmm2, xmm4
movdqa xmm3,xmm0 movdqa xmm3, xmm0
paddd xmm4,xmm7 ; xmm4=data1L paddd xmm4, xmm7 ; xmm4=data1L
paddd xmm0,xmm6 ; xmm0=data1H paddd xmm0, xmm6 ; xmm0=data1H
psubd xmm2,xmm7 ; xmm2=data2L psubd xmm2, xmm7 ; xmm2=data2L
psubd xmm3,xmm6 ; xmm3=data2H psubd xmm3, xmm6 ; xmm3=data2H
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
paddd xmm4,xmm7 paddd xmm4, xmm7
paddd xmm0,xmm7 paddd xmm0, xmm7
psrad xmm4,DESCALE_P1_4 psrad xmm4, DESCALE_P1_4
psrad xmm0,DESCALE_P1_4 psrad xmm0, DESCALE_P1_4
paddd xmm2,xmm7 paddd xmm2, xmm7
paddd xmm3,xmm7 paddd xmm3, xmm7
psrad xmm2,DESCALE_P1_4 psrad xmm2, DESCALE_P1_4
psrad xmm3,DESCALE_P1_4 psrad xmm3, DESCALE_P1_4
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
movdqa xmm6,xmm1 ; transpose coefficients(phase 1) movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
movdqa xmm7,xmm2 ; transpose coefficients(phase 1) movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
movdqa xmm0,xmm1 ; transpose coefficients(phase 2) movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
movdqa xmm3,xmm6 ; transpose coefficients(phase 2) movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end: .column_end:
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
@@ -322,70 +322,70 @@ EXTN(jsimd_idct_4x4_sse2):
; -- Even part ; -- Even part
pxor xmm4,xmm4 pxor xmm4, xmm4
punpcklwd xmm4,xmm1 ; xmm4=tmp0 punpcklwd xmm4, xmm1 ; xmm4=tmp0
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
; -- Odd part ; -- Odd part
punpckhwd xmm1,xmm0 punpckhwd xmm1, xmm0
punpckhwd xmm6,xmm3 punpckhwd xmm6, xmm3
movdqa xmm5,xmm1 movdqa xmm5, xmm1
movdqa xmm2,xmm6 movdqa xmm2, xmm6
pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
paddd xmm6,xmm1 ; xmm6=tmp2 paddd xmm6, xmm1 ; xmm6=tmp2
paddd xmm2,xmm5 ; xmm2=tmp0 paddd xmm2, xmm5 ; xmm2=tmp0
; -- Even part ; -- Even part
punpcklwd xmm0,xmm3 punpcklwd xmm0, xmm3
pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
movdqa xmm7,xmm4 movdqa xmm7, xmm4
paddd xmm4,xmm0 ; xmm4=tmp10 paddd xmm4, xmm0 ; xmm4=tmp10
psubd xmm7,xmm0 ; xmm7=tmp12 psubd xmm7, xmm0 ; xmm7=tmp12
; -- Final output stage ; -- Final output stage
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
movdqa xmm5,xmm4 movdqa xmm5, xmm4
movdqa xmm3,xmm7 movdqa xmm3, xmm7
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
paddd xmm4,xmm1 paddd xmm4, xmm1
paddd xmm7,xmm1 paddd xmm7, xmm1
psrad xmm4,DESCALE_P2_4 psrad xmm4, DESCALE_P2_4
psrad xmm7,DESCALE_P2_4 psrad xmm7, DESCALE_P2_4
paddd xmm5,xmm1 paddd xmm5, xmm1
paddd xmm3,xmm1 paddd xmm3, xmm1
psrad xmm5,DESCALE_P2_4 psrad xmm5, DESCALE_P2_4
psrad xmm3,DESCALE_P2_4 psrad xmm3, DESCALE_P2_4
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
movdqa xmm0,xmm4 ; transpose coefficients(phase 1) movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
movdqa xmm6,xmm4 ; transpose coefficients(phase 2) movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -401,7 +401,7 @@ EXTN(jsimd_idct_4x4_sse2):
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx poppic ebx
mov esp,ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
ret ret
@@ -427,7 +427,7 @@ EXTN(jsimd_idct_4x4_sse2):
EXTN(jsimd_idct_2x2_sse2): EXTN(jsimd_idct_2x2_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -465,27 +465,27 @@ EXTN(jsimd_idct_2x2_sse2):
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
pcmpeqd xmm7,xmm7 pcmpeqd xmm7, xmm7
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
; -- Even part ; -- Even part
@@ -494,36 +494,36 @@ EXTN(jsimd_idct_2x2_sse2):
; xmm6=(00 01 ** 03 ** 05 ** 07) ; xmm6=(00 01 ** 03 ** 05 ** 07)
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
; -- Final output stage ; -- Final output stage
movdqa xmm3,xmm6 movdqa xmm3, xmm6
movdqa xmm5,xmm1 movdqa xmm5, xmm1
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
movdqa xmm7,xmm1 movdqa xmm7, xmm1
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
paddd xmm6,xmm2 paddd xmm6, xmm2
psrad xmm6,DESCALE_P1_2 psrad xmm6, DESCALE_P1_2
paddd xmm1,xmm2 paddd xmm1, xmm2
paddd xmm7,xmm2 paddd xmm7, xmm2
psrad xmm1,DESCALE_P1_2 psrad xmm1, DESCALE_P1_2
psrad xmm7,DESCALE_P1_2 psrad xmm7, DESCALE_P1_2
; -- Prefetch the next coefficient block ; -- Prefetch the next coefficient block
@@ -546,34 +546,34 @@ EXTN(jsimd_idct_2x2_sse2):
; -- Odd part ; -- Odd part
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
; -- Even part ; -- Even part
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
; -- Final output stage ; -- Final output stage
movdqa xmm4,xmm6 movdqa xmm4, xmm6
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
psrad xmm6,DESCALE_P2_2 psrad xmm6, DESCALE_P2_2
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]

View File

@@ -39,14 +39,14 @@
EXTN(jsimd_convsamp_float_sse2): EXTN(jsimd_convsamp_float_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 psllw xmm7, 7
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
mov rsi, r10 mov rsi, r10
mov eax, r11d mov eax, r11d
@@ -59,25 +59,25 @@ EXTN(jsimd_convsamp_float_sse2):
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
psubb xmm0,xmm7 ; xmm0=(01234567) psubb xmm0, xmm7 ; xmm0=(01234567)
psubb xmm1,xmm7 ; xmm1=(89ABCDEF) psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2,xmm2 ; xmm2=(0123) cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
cvtdq2ps xmm0,xmm0 ; xmm0=(4567) cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
@@ -113,8 +113,8 @@ EXTN(jsimd_convsamp_float_sse2):
EXTN(jsimd_quantize_float_sse2): EXTN(jsimd_quantize_float_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov rsi, r12 mov rsi, r12
@@ -131,13 +131,13 @@ EXTN(jsimd_quantize_float_sse2):
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
cvtps2dq xmm0,xmm0 cvtps2dq xmm0, xmm0
cvtps2dq xmm1,xmm1 cvtps2dq xmm1, xmm1
cvtps2dq xmm2,xmm2 cvtps2dq xmm2, xmm2
cvtps2dq xmm3,xmm3 cvtps2dq xmm3, xmm3
packssdw xmm0,xmm1 packssdw xmm0, xmm1
packssdw xmm2,xmm3 packssdw xmm2, xmm3
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

View File

@@ -38,22 +38,22 @@
EXTN(jsimd_convsamp_float_sse2): EXTN(jsimd_convsamp_float_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 psllw xmm7, 7
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16,7 alignx 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -61,25 +61,25 @@ EXTN(jsimd_convsamp_float_sse2):
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
psubb xmm0,xmm7 ; xmm0=(01234567) psubb xmm0, xmm7 ; xmm0=(01234567)
psubb xmm1,xmm7 ; xmm1=(89ABCDEF) psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
cvtdq2ps xmm2,xmm2 ; xmm2=(0123) cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
cvtdq2ps xmm0,xmm0 ; xmm0=(4567) cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
@@ -118,7 +118,7 @@ EXTN(jsimd_convsamp_float_sse2):
EXTN(jsimd_quantize_float_sse2): EXTN(jsimd_quantize_float_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -129,7 +129,7 @@ EXTN(jsimd_quantize_float_sse2):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16 mov eax, DCTSIZE2/16
alignx 16,7 alignx 16, 7
.quantloop: .quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
@@ -140,13 +140,13 @@ EXTN(jsimd_quantize_float_sse2):
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
cvtps2dq xmm0,xmm0 cvtps2dq xmm0, xmm0
cvtps2dq xmm1,xmm1 cvtps2dq xmm1, xmm1
cvtps2dq xmm2,xmm2 cvtps2dq xmm2, xmm2
cvtps2dq xmm3,xmm3 cvtps2dq xmm3, xmm3
packssdw xmm0,xmm1 packssdw xmm0, xmm1
packssdw xmm2,xmm3 packssdw xmm2, xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

View File

@@ -39,14 +39,14 @@
EXTN(jsimd_convsamp_sse2): EXTN(jsimd_convsamp_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
push rbx push rbx
pxor xmm6,xmm6 ; xmm6=(all 0's) pxor xmm6, xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
mov rsi, r10 mov rsi, r10
mov eax, r11d mov eax, r11d
@@ -65,14 +65,14 @@ EXTN(jsimd_convsamp_sse2):
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
punpcklbw xmm0,xmm6 ; xmm0=(01234567) punpcklbw xmm0, xmm6 ; xmm0=(01234567)
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2,xmm7 paddw xmm2, xmm7
paddw xmm3,xmm7 paddw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
@@ -115,8 +115,8 @@ EXTN(jsimd_convsamp_sse2):
EXTN(jsimd_quantize_sse2): EXTN(jsimd_quantize_sse2):
push rbp push rbp
mov rax,rsp mov rax, rsp
mov rbp,rsp mov rbp, rsp
collect_args collect_args
mov rsi, r12 mov rsi, r12
@@ -128,22 +128,22 @@ EXTN(jsimd_quantize_sse2):
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
psraw xmm4,(WORD_BIT-1) psraw xmm4, (WORD_BIT-1)
psraw xmm5,(WORD_BIT-1) psraw xmm5, (WORD_BIT-1)
psraw xmm6,(WORD_BIT-1) psraw xmm6, (WORD_BIT-1)
psraw xmm7,(WORD_BIT-1) psraw xmm7, (WORD_BIT-1)
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
@@ -158,14 +158,14 @@ EXTN(jsimd_quantize_sse2):
pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 psubw xmm0, xmm4
psubw xmm1,xmm5 psubw xmm1, xmm5
psubw xmm2,xmm6 psubw xmm2, xmm6
psubw xmm3,xmm7 psubw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2

View File

@@ -38,22 +38,22 @@
EXTN(jsimd_convsamp_sse2): EXTN(jsimd_convsamp_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
pxor xmm6,xmm6 ; xmm6=(all 0's) pxor xmm6, xmm6 ; xmm6=(all 0's)
pcmpeqw xmm7,xmm7 pcmpeqw xmm7, xmm7
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16,7 alignx 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -67,14 +67,14 @@ EXTN(jsimd_convsamp_sse2):
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
punpcklbw xmm0,xmm6 ; xmm0=(01234567) punpcklbw xmm0, xmm6 ; xmm0=(01234567)
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
paddw xmm0,xmm7 paddw xmm0, xmm7
paddw xmm1,xmm7 paddw xmm1, xmm7
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
paddw xmm2,xmm7 paddw xmm2, xmm7
paddw xmm3,xmm7 paddw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
@@ -120,7 +120,7 @@ EXTN(jsimd_convsamp_sse2):
EXTN(jsimd_quantize_sse2): EXTN(jsimd_quantize_sse2):
push ebp push ebp
mov ebp,esp mov ebp, esp
; push ebx ; unused ; push ebx ; unused
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
@@ -131,28 +131,28 @@ EXTN(jsimd_quantize_sse2):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/32 mov eax, DCTSIZE2/32
alignx 16,7 alignx 16, 7
.quantloop: .quantloop:
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
movdqa xmm0,xmm4 movdqa xmm0, xmm4
movdqa xmm1,xmm5 movdqa xmm1, xmm5
movdqa xmm2,xmm6 movdqa xmm2, xmm6
movdqa xmm3,xmm7 movdqa xmm3, xmm7
psraw xmm4,(WORD_BIT-1) psraw xmm4, (WORD_BIT-1)
psraw xmm5,(WORD_BIT-1) psraw xmm5, (WORD_BIT-1)
psraw xmm6,(WORD_BIT-1) psraw xmm6, (WORD_BIT-1)
psraw xmm7,(WORD_BIT-1) psraw xmm7, (WORD_BIT-1)
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
@@ -167,14 +167,14 @@ EXTN(jsimd_quantize_sse2):
pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
pxor xmm0,xmm4 pxor xmm0, xmm4
pxor xmm1,xmm5 pxor xmm1, xmm5
pxor xmm2,xmm6 pxor xmm2, xmm6
pxor xmm3,xmm7 pxor xmm3, xmm7
psubw xmm0,xmm4 psubw xmm0, xmm4
psubw xmm1,xmm5 psubw xmm1, xmm5
psubw xmm2,xmm6 psubw xmm2, xmm6
psubw xmm3,xmm7 psubw xmm3, xmm7
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2

View File

@@ -190,7 +190,7 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%ifdef PIC ; ------------------------------------------- %ifdef PIC ; -------------------------------------------
%ifidn GOT_SYMBOL,_MACHO_PIC_ ; -------------------- %ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
; At present, nasm doesn't seem to support PIC generation for Mach-O. ; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky. ; The PIC support code below is a little tricky.
@@ -210,19 +210,20 @@ const_base:
ret ret
%%adjust: %%adjust:
push ebp push ebp
xor ebp,ebp ; ebp = 0 xor ebp, ebp ; ebp = 0
%ifidni %1,ebx ; (%1 == ebx) %ifidni %1, ebx ; (%1 == ebx)
; db 0x8D,0x9C + jmp near const_base = ; db 0x8D,0x9C + jmp near const_base =
; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
db 0x8D,0x9C ; 8D,9C db 0x8D, 0x9C ; 8D,9C
jmp near const_base ; E9,(const_base-%%ref) jmp near const_base ; E9,(const_base-%%ref)
%%ref: %%ref:
%else ; (%1 != ebx) %else ; (%1 != ebx)
; db 0x8D,0x8C + jmp near const_base = ; db 0x8D,0x8C + jmp near const_base =
; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
db 0x8D,0x8C ; 8D,8C db 0x8D, 0x8C ; 8D,8C
jmp near const_base ; E9,(const_base-%%ref) jmp near const_base ; E9,(const_base-%%ref)
%%ref: mov %1, ecx %%ref:
mov %1, ecx
%endif ; (%1 == ebx) %endif ; (%1 == ebx)
pop ebp pop ebp
%endmacro %endmacro
@@ -251,7 +252,7 @@ const_base:
pop %1 pop %1
%endmacro %endmacro
%imacro movpic 2.nolist %imacro movpic 2.nolist
mov %1,%2 mov %1, %2
%endmacro %endmacro
%else ; !PIC ----------------------------------------- %else ; !PIC -----------------------------------------
@@ -277,7 +278,8 @@ const_base:
%define FILLB(b,n) (($$-(b)) & ((n)-1)) %define FILLB(b,n) (($$-(b)) & ((n)-1))
%imacro alignx 1-2.nolist 0xFFFF %imacro alignx 1-2.nolist 0xFFFF
%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ %%bs: \
times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
db 0x90 ; nop db 0x90 ; nop
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]