Reformat SSE/SSE2 SIMD code to improve readability
This commit is contained in:
@@ -42,17 +42,17 @@
|
|||||||
|
|
||||||
EXTN(jsimd_rgb_ycc_convert_sse2):
|
EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d
|
mov ecx, r10d
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push rcx
|
push rcx
|
||||||
@@ -70,7 +70,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
|
|
||||||
mov rsi, r11
|
mov rsi, r11
|
||||||
mov eax, r14d
|
mov eax, r14d
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jle near .return
|
jle near .return
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push rdx
|
push rdx
|
||||||
@@ -92,7 +92,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
.column_ld1:
|
.column_ld1:
|
||||||
push rax
|
push rax
|
||||||
push rdx
|
push rdx
|
||||||
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
||||||
test cl, SIZEOF_BYTE
|
test cl, SIZEOF_BYTE
|
||||||
jz short .column_ld2
|
jz short .column_ld2
|
||||||
sub rcx, byte SIZEOF_BYTE
|
sub rcx, byte SIZEOF_BYTE
|
||||||
@@ -103,9 +103,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
sub rcx, byte SIZEOF_WORD
|
sub rcx, byte SIZEOF_WORD
|
||||||
movzx rdx, WORD [rsi+rcx]
|
movzx rdx, WORD [rsi+rcx]
|
||||||
shl rax, WORD_BIT
|
shl rax, WORD_BIT
|
||||||
or rax,rdx
|
or rax, rdx
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
movd xmmA,eax
|
movd xmmA, eax
|
||||||
pop rdx
|
pop rdx
|
||||||
pop rax
|
pop rax
|
||||||
test cl, SIZEOF_DWORD
|
test cl, SIZEOF_DWORD
|
||||||
@@ -113,18 +113,18 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
sub rcx, byte SIZEOF_DWORD
|
sub rcx, byte SIZEOF_DWORD
|
||||||
movd xmmF, XMM_DWORD [rsi+rcx]
|
movd xmmF, XMM_DWORD [rsi+rcx]
|
||||||
pslldq xmmA, SIZEOF_DWORD
|
pslldq xmmA, SIZEOF_DWORD
|
||||||
por xmmA,xmmF
|
por xmmA, xmmF
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_MMWORD
|
test cl, SIZEOF_MMWORD
|
||||||
jz short .column_ld16
|
jz short .column_ld16
|
||||||
sub rcx, byte SIZEOF_MMWORD
|
sub rcx, byte SIZEOF_MMWORD
|
||||||
movq xmmB, XMM_MMWORD [rsi+rcx]
|
movq xmmB, XMM_MMWORD [rsi+rcx]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmB
|
por xmmA, xmmB
|
||||||
.column_ld16:
|
.column_ld16:
|
||||||
test cl, SIZEOF_XMMWORD
|
test cl, SIZEOF_XMMWORD
|
||||||
jz short .column_ld32
|
jz short .column_ld32
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jmp short .rgb_ycc_cnv
|
jmp short .rgb_ycc_cnv
|
||||||
@@ -132,7 +132,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
test cl, 2*SIZEOF_XMMWORD
|
test cl, 2*SIZEOF_XMMWORD
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_ycc_cnv
|
jz short .rgb_ycc_cnv
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_ycc_cnv
|
jmp short .rgb_ycc_cnv
|
||||||
@@ -147,49 +147,49 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||||
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||||
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||||
|
|
||||||
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||||
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||||
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||||
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||||
|
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||||
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||||
|
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
pxor xmmH,xmmH
|
pxor xmmH, xmmH
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
|
|
||||||
movdqa xmmF,xmmD
|
movdqa xmmF, xmmD
|
||||||
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
@@ -204,19 +204,19 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
sub rcx, byte SIZEOF_XMMWORD/8
|
sub rcx, byte SIZEOF_XMMWORD/8
|
||||||
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmE
|
por xmmA, xmmE
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
test cl, SIZEOF_XMMWORD/4
|
test cl, SIZEOF_XMMWORD/4
|
||||||
jz short .column_ld8
|
jz short .column_ld8
|
||||||
sub rcx, byte SIZEOF_XMMWORD/4
|
sub rcx, byte SIZEOF_XMMWORD/4
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_XMMWORD/2
|
test cl, SIZEOF_XMMWORD/2
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_ycc_cnv
|
jz short .rgb_ycc_cnv
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqa xmmH,xmmE
|
movdqa xmmH, xmmE
|
||||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_ycc_cnv
|
jmp short .rgb_ycc_cnv
|
||||||
@@ -233,48 +233,48 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||||
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||||
|
|
||||||
movdqa xmmC,xmmF
|
movdqa xmmC, xmmF
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||||
|
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||||
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||||
|
|
||||||
movdqa xmmG,xmmD
|
movdqa xmmG, xmmD
|
||||||
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||||
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
movdqa xmmH,xmmB
|
movdqa xmmH, xmmB
|
||||||
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
pxor xmmF,xmmF
|
pxor xmmF, xmmF
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmD,xmmB
|
movdqa xmmD, xmmB
|
||||||
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
|
|
||||||
movdqa xmmG,xmmE
|
movdqa xmmG, xmmE
|
||||||
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
punpcklbw xmmF,xmmH
|
punpcklbw xmmF, xmmH
|
||||||
punpckhbw xmmH,xmmH
|
punpckhbw xmmH, xmmH
|
||||||
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|
||||||
@@ -296,158 +296,158 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
||||||
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
punpcklwd xmm1,xmm3
|
punpcklwd xmm1, xmm3
|
||||||
punpckhwd xmm6,xmm3
|
punpckhwd xmm6, xmm3
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||||
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||||
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||||
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
|
|
||||||
pxor xmm1,xmm1
|
pxor xmm1, xmm1
|
||||||
pxor xmm6,xmm6
|
pxor xmm6, xmm6
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=BOL
|
punpcklwd xmm1, xmm5 ; xmm1=BOL
|
||||||
punpckhwd xmm6,xmm5 ; xmm6=BOH
|
punpckhwd xmm6, xmm5 ; xmm6=BOH
|
||||||
psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
|
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
|
||||||
psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
|
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
|
movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
paddd xmm4,xmm6
|
paddd xmm4, xmm6
|
||||||
paddd xmm7,xmm5
|
paddd xmm7, xmm5
|
||||||
paddd xmm4,xmm5
|
paddd xmm4, xmm5
|
||||||
psrld xmm7,SCALEBITS ; xmm7=CbOL
|
psrld xmm7, SCALEBITS ; xmm7=CbOL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=CbOH
|
psrld xmm4, SCALEBITS ; xmm4=CbOH
|
||||||
packssdw xmm7,xmm4 ; xmm7=CbO
|
packssdw xmm7, xmm4 ; xmm7=CbO
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
||||||
|
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
punpcklwd xmm0,xmm2
|
punpcklwd xmm0, xmm2
|
||||||
punpckhwd xmm6,xmm2
|
punpckhwd xmm6, xmm2
|
||||||
movdqa xmm5,xmm0
|
movdqa xmm5, xmm0
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||||
pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
|
|
||||||
pxor xmm0,xmm0
|
pxor xmm0, xmm0
|
||||||
pxor xmm6,xmm6
|
pxor xmm6, xmm6
|
||||||
punpcklwd xmm0,xmm1 ; xmm0=BEL
|
punpcklwd xmm0, xmm1 ; xmm0=BEL
|
||||||
punpckhwd xmm6,xmm1 ; xmm6=BEH
|
punpckhwd xmm6, xmm1 ; xmm6=BEH
|
||||||
psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
|
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
|
||||||
psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
|
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm5,xmm0
|
paddd xmm5, xmm0
|
||||||
paddd xmm4,xmm6
|
paddd xmm4, xmm6
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
psrld xmm5,SCALEBITS ; xmm5=CbEL
|
psrld xmm5, SCALEBITS ; xmm5=CbEL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=CbEH
|
psrld xmm4, SCALEBITS ; xmm4=CbEH
|
||||||
packssdw xmm5,xmm4 ; xmm5=CbE
|
packssdw xmm5, xmm4 ; xmm5=CbE
|
||||||
|
|
||||||
psllw xmm7,BYTE_BIT
|
psllw xmm7, BYTE_BIT
|
||||||
por xmm5,xmm7 ; xmm5=Cb
|
por xmm5, xmm7 ; xmm5=Cb
|
||||||
movdqa XMMWORD [rbx], xmm5 ; Save Cb
|
movdqa XMMWORD [rbx], xmm5 ; Save Cb
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
||||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
punpckhwd xmm4,xmm3
|
punpckhwd xmm4, xmm3
|
||||||
movdqa xmm7,xmm0
|
movdqa xmm7, xmm0
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||||
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||||
pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||||
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||||
|
|
||||||
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
|
movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm0, XMMWORD [wk(4)]
|
paddd xmm0, XMMWORD [wk(4)]
|
||||||
paddd xmm4, XMMWORD [wk(5)]
|
paddd xmm4, XMMWORD [wk(5)]
|
||||||
paddd xmm0,xmm3
|
paddd xmm0, xmm3
|
||||||
paddd xmm4,xmm3
|
paddd xmm4, xmm3
|
||||||
psrld xmm0,SCALEBITS ; xmm0=YOL
|
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YOH
|
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||||
packssdw xmm0,xmm4 ; xmm0=YO
|
packssdw xmm0, xmm4 ; xmm0=YO
|
||||||
|
|
||||||
pxor xmm3,xmm3
|
pxor xmm3, xmm3
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm3,xmm1 ; xmm3=ROL
|
punpcklwd xmm3, xmm1 ; xmm3=ROL
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=ROH
|
punpckhwd xmm4, xmm1 ; xmm4=ROH
|
||||||
psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
|
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
|
||||||
psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
|
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm7,xmm3
|
paddd xmm7, xmm3
|
||||||
paddd xmm5,xmm4
|
paddd xmm5, xmm4
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
psrld xmm7,SCALEBITS ; xmm7=CrOL
|
psrld xmm7, SCALEBITS ; xmm7=CrOL
|
||||||
psrld xmm5,SCALEBITS ; xmm5=CrOH
|
psrld xmm5, SCALEBITS ; xmm5=CrOH
|
||||||
packssdw xmm7,xmm5 ; xmm7=CrO
|
packssdw xmm7, xmm5 ; xmm7=CrO
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpcklwd xmm6,xmm2
|
punpcklwd xmm6, xmm2
|
||||||
punpckhwd xmm4,xmm2
|
punpckhwd xmm4, xmm2
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||||
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||||
pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||||
pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||||
|
|
||||||
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
|
movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm6, XMMWORD [wk(6)]
|
paddd xmm6, XMMWORD [wk(6)]
|
||||||
paddd xmm4, XMMWORD [wk(7)]
|
paddd xmm4, XMMWORD [wk(7)]
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
psrld xmm6,SCALEBITS ; xmm6=YEL
|
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YEH
|
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||||
packssdw xmm6,xmm4 ; xmm6=YE
|
packssdw xmm6, xmm4 ; xmm6=YE
|
||||||
|
|
||||||
psllw xmm0,BYTE_BIT
|
psllw xmm0, BYTE_BIT
|
||||||
por xmm6,xmm0 ; xmm6=Y
|
por xmm6, xmm0 ; xmm6=Y
|
||||||
movdqa XMMWORD [rdi], xmm6 ; Save Y
|
movdqa XMMWORD [rdi], xmm6 ; Save Y
|
||||||
|
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm2,xmm3 ; xmm2=REL
|
punpcklwd xmm2, xmm3 ; xmm2=REL
|
||||||
punpckhwd xmm4,xmm3 ; xmm4=REH
|
punpckhwd xmm4, xmm3 ; xmm4=REH
|
||||||
psrld xmm2,1 ; xmm2=REL*FIX(0.500)
|
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
|
||||||
psrld xmm4,1 ; xmm4=REH*FIX(0.500)
|
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
|
movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm5,xmm4
|
paddd xmm5, xmm4
|
||||||
paddd xmm1,xmm0
|
paddd xmm1, xmm0
|
||||||
paddd xmm5,xmm0
|
paddd xmm5, xmm0
|
||||||
psrld xmm1,SCALEBITS ; xmm1=CrEL
|
psrld xmm1, SCALEBITS ; xmm1=CrEL
|
||||||
psrld xmm5,SCALEBITS ; xmm5=CrEH
|
psrld xmm5, SCALEBITS ; xmm5=CrEH
|
||||||
packssdw xmm1,xmm5 ; xmm1=CrE
|
packssdw xmm1, xmm5 ; xmm1=CrE
|
||||||
|
|
||||||
psllw xmm7,BYTE_BIT
|
psllw xmm7, BYTE_BIT
|
||||||
por xmm1,xmm7 ; xmm1=Cr
|
por xmm1, xmm7 ; xmm1=Cr
|
||||||
movdqa XMMWORD [rdx], xmm1 ; Save Cr
|
movdqa XMMWORD [rdx], xmm1 ; Save Cr
|
||||||
|
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
@@ -457,7 +457,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
add rdx, byte SIZEOF_XMMWORD ; outptr2
|
add rdx, byte SIZEOF_XMMWORD ; outptr2
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jnz near .column_ld1
|
jnz near .column_ld1
|
||||||
|
|
||||||
pop rcx ; col
|
pop rcx ; col
|
||||||
@@ -476,7 +476,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -42,11 +42,11 @@
|
|||||||
|
|
||||||
EXTN(jsimd_rgb_ycc_convert_sse2):
|
EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic eax ; make a room for GOT address
|
pushpic eax ; make a room for GOT address
|
||||||
push ebx
|
push ebx
|
||||||
@@ -59,7 +59,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
movpic POINTER [gotptr], ebx ; save GOT address
|
movpic POINTER [gotptr], ebx ; save GOT address
|
||||||
|
|
||||||
mov ecx, JDIMENSION [img_width(eax)]
|
mov ecx, JDIMENSION [img_width(eax)]
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push ecx
|
push ecx
|
||||||
@@ -77,9 +77,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
|
|
||||||
mov esi, JSAMPARRAY [input_buf(eax)]
|
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||||
mov eax, INT [num_rows(eax)]
|
mov eax, INT [num_rows(eax)]
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle near .return
|
jle near .return
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
pushpic eax
|
pushpic eax
|
||||||
push edx
|
push edx
|
||||||
@@ -96,14 +96,14 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||||
|
|
||||||
.column_ld1:
|
.column_ld1:
|
||||||
push eax
|
push eax
|
||||||
push edx
|
push edx
|
||||||
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||||
test cl, SIZEOF_BYTE
|
test cl, SIZEOF_BYTE
|
||||||
jz short .column_ld2
|
jz short .column_ld2
|
||||||
sub ecx, byte SIZEOF_BYTE
|
sub ecx, byte SIZEOF_BYTE
|
||||||
@@ -114,9 +114,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
sub ecx, byte SIZEOF_WORD
|
sub ecx, byte SIZEOF_WORD
|
||||||
movzx edx, WORD [esi+ecx]
|
movzx edx, WORD [esi+ecx]
|
||||||
shl eax, WORD_BIT
|
shl eax, WORD_BIT
|
||||||
or eax,edx
|
or eax, edx
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
movd xmmA,eax
|
movd xmmA, eax
|
||||||
pop edx
|
pop edx
|
||||||
pop eax
|
pop eax
|
||||||
test cl, SIZEOF_DWORD
|
test cl, SIZEOF_DWORD
|
||||||
@@ -124,18 +124,18 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
sub ecx, byte SIZEOF_DWORD
|
sub ecx, byte SIZEOF_DWORD
|
||||||
movd xmmF, XMM_DWORD [esi+ecx]
|
movd xmmF, XMM_DWORD [esi+ecx]
|
||||||
pslldq xmmA, SIZEOF_DWORD
|
pslldq xmmA, SIZEOF_DWORD
|
||||||
por xmmA,xmmF
|
por xmmA, xmmF
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_MMWORD
|
test cl, SIZEOF_MMWORD
|
||||||
jz short .column_ld16
|
jz short .column_ld16
|
||||||
sub ecx, byte SIZEOF_MMWORD
|
sub ecx, byte SIZEOF_MMWORD
|
||||||
movq xmmB, XMM_MMWORD [esi+ecx]
|
movq xmmB, XMM_MMWORD [esi+ecx]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmB
|
por xmmA, xmmB
|
||||||
.column_ld16:
|
.column_ld16:
|
||||||
test cl, SIZEOF_XMMWORD
|
test cl, SIZEOF_XMMWORD
|
||||||
jz short .column_ld32
|
jz short .column_ld32
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jmp short .rgb_ycc_cnv
|
jmp short .rgb_ycc_cnv
|
||||||
@@ -143,11 +143,11 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
test cl, 2*SIZEOF_XMMWORD
|
test cl, 2*SIZEOF_XMMWORD
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_ycc_cnv
|
jz short .rgb_ycc_cnv
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_ycc_cnv
|
jmp short .rgb_ycc_cnv
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
@@ -159,49 +159,49 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||||
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||||
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||||
|
|
||||||
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||||
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||||
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||||
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||||
|
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||||
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||||
|
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
pxor xmmH,xmmH
|
pxor xmmH, xmmH
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
|
|
||||||
movdqa xmmF,xmmD
|
movdqa xmmF, xmmD
|
||||||
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
@@ -216,23 +216,23 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
sub ecx, byte SIZEOF_XMMWORD/8
|
sub ecx, byte SIZEOF_XMMWORD/8
|
||||||
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmE
|
por xmmA, xmmE
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
test cl, SIZEOF_XMMWORD/4
|
test cl, SIZEOF_XMMWORD/4
|
||||||
jz short .column_ld8
|
jz short .column_ld8
|
||||||
sub ecx, byte SIZEOF_XMMWORD/4
|
sub ecx, byte SIZEOF_XMMWORD/4
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_XMMWORD/2
|
test cl, SIZEOF_XMMWORD/2
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_ycc_cnv
|
jz short .rgb_ycc_cnv
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqa xmmH,xmmE
|
movdqa xmmH, xmmE
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_ycc_cnv
|
jmp short .rgb_ycc_cnv
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
@@ -246,48 +246,48 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||||
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||||
|
|
||||||
movdqa xmmC,xmmF
|
movdqa xmmC, xmmF
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||||
|
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||||
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||||
|
|
||||||
movdqa xmmG,xmmD
|
movdqa xmmG, xmmD
|
||||||
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||||
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
movdqa xmmH,xmmB
|
movdqa xmmH, xmmB
|
||||||
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
pxor xmmF,xmmF
|
pxor xmmF, xmmF
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmD,xmmB
|
movdqa xmmD, xmmB
|
||||||
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
|
|
||||||
movdqa xmmG,xmmE
|
movdqa xmmG, xmmE
|
||||||
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
punpcklbw xmmF,xmmH
|
punpcklbw xmmF, xmmH
|
||||||
punpckhbw xmmH,xmmH
|
punpckhbw xmmH, xmmH
|
||||||
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|
||||||
@@ -309,158 +309,158 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
||||||
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
punpcklwd xmm1,xmm3
|
punpcklwd xmm1, xmm3
|
||||||
punpckhwd xmm6,xmm3
|
punpckhwd xmm6, xmm3
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||||
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
|
|
||||||
pxor xmm1,xmm1
|
pxor xmm1, xmm1
|
||||||
pxor xmm6,xmm6
|
pxor xmm6, xmm6
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=BOL
|
punpcklwd xmm1, xmm5 ; xmm1=BOL
|
||||||
punpckhwd xmm6,xmm5 ; xmm6=BOH
|
punpckhwd xmm6, xmm5 ; xmm6=BOH
|
||||||
psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
|
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
|
||||||
psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
|
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
|
movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
paddd xmm4,xmm6
|
paddd xmm4, xmm6
|
||||||
paddd xmm7,xmm5
|
paddd xmm7, xmm5
|
||||||
paddd xmm4,xmm5
|
paddd xmm4, xmm5
|
||||||
psrld xmm7,SCALEBITS ; xmm7=CbOL
|
psrld xmm7, SCALEBITS ; xmm7=CbOL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=CbOH
|
psrld xmm4, SCALEBITS ; xmm4=CbOH
|
||||||
packssdw xmm7,xmm4 ; xmm7=CbO
|
packssdw xmm7, xmm4 ; xmm7=CbO
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
||||||
|
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
punpcklwd xmm0,xmm2
|
punpcklwd xmm0, xmm2
|
||||||
punpckhwd xmm6,xmm2
|
punpckhwd xmm6, xmm2
|
||||||
movdqa xmm5,xmm0
|
movdqa xmm5, xmm0
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
|
|
||||||
pxor xmm0,xmm0
|
pxor xmm0, xmm0
|
||||||
pxor xmm6,xmm6
|
pxor xmm6, xmm6
|
||||||
punpcklwd xmm0,xmm1 ; xmm0=BEL
|
punpcklwd xmm0, xmm1 ; xmm0=BEL
|
||||||
punpckhwd xmm6,xmm1 ; xmm6=BEH
|
punpckhwd xmm6, xmm1 ; xmm6=BEH
|
||||||
psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
|
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
|
||||||
psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
|
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
|
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm5,xmm0
|
paddd xmm5, xmm0
|
||||||
paddd xmm4,xmm6
|
paddd xmm4, xmm6
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
psrld xmm5,SCALEBITS ; xmm5=CbEL
|
psrld xmm5, SCALEBITS ; xmm5=CbEL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=CbEH
|
psrld xmm4, SCALEBITS ; xmm4=CbEH
|
||||||
packssdw xmm5,xmm4 ; xmm5=CbE
|
packssdw xmm5, xmm4 ; xmm5=CbE
|
||||||
|
|
||||||
psllw xmm7,BYTE_BIT
|
psllw xmm7, BYTE_BIT
|
||||||
por xmm5,xmm7 ; xmm5=Cb
|
por xmm5, xmm7 ; xmm5=Cb
|
||||||
movdqa XMMWORD [ebx], xmm5 ; Save Cb
|
movdqa XMMWORD [ebx], xmm5 ; Save Cb
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
||||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
punpckhwd xmm4,xmm3
|
punpckhwd xmm4, xmm3
|
||||||
movdqa xmm7,xmm0
|
movdqa xmm7, xmm0
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||||
pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||||
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||||
|
|
||||||
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
|
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm0, XMMWORD [wk(4)]
|
paddd xmm0, XMMWORD [wk(4)]
|
||||||
paddd xmm4, XMMWORD [wk(5)]
|
paddd xmm4, XMMWORD [wk(5)]
|
||||||
paddd xmm0,xmm3
|
paddd xmm0, xmm3
|
||||||
paddd xmm4,xmm3
|
paddd xmm4, xmm3
|
||||||
psrld xmm0,SCALEBITS ; xmm0=YOL
|
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YOH
|
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||||
packssdw xmm0,xmm4 ; xmm0=YO
|
packssdw xmm0, xmm4 ; xmm0=YO
|
||||||
|
|
||||||
pxor xmm3,xmm3
|
pxor xmm3, xmm3
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm3,xmm1 ; xmm3=ROL
|
punpcklwd xmm3, xmm1 ; xmm3=ROL
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=ROH
|
punpckhwd xmm4, xmm1 ; xmm4=ROH
|
||||||
psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
|
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
|
||||||
psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
|
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
|
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm7,xmm3
|
paddd xmm7, xmm3
|
||||||
paddd xmm5,xmm4
|
paddd xmm5, xmm4
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
psrld xmm7,SCALEBITS ; xmm7=CrOL
|
psrld xmm7, SCALEBITS ; xmm7=CrOL
|
||||||
psrld xmm5,SCALEBITS ; xmm5=CrOH
|
psrld xmm5, SCALEBITS ; xmm5=CrOH
|
||||||
packssdw xmm7,xmm5 ; xmm7=CrO
|
packssdw xmm7, xmm5 ; xmm7=CrO
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpcklwd xmm6,xmm2
|
punpcklwd xmm6, xmm2
|
||||||
punpckhwd xmm4,xmm2
|
punpckhwd xmm4, xmm2
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||||
pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||||
pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
|
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm6, XMMWORD [wk(6)]
|
paddd xmm6, XMMWORD [wk(6)]
|
||||||
paddd xmm4, XMMWORD [wk(7)]
|
paddd xmm4, XMMWORD [wk(7)]
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
psrld xmm6,SCALEBITS ; xmm6=YEL
|
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YEH
|
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||||
packssdw xmm6,xmm4 ; xmm6=YE
|
packssdw xmm6, xmm4 ; xmm6=YE
|
||||||
|
|
||||||
psllw xmm0,BYTE_BIT
|
psllw xmm0, BYTE_BIT
|
||||||
por xmm6,xmm0 ; xmm6=Y
|
por xmm6, xmm0 ; xmm6=Y
|
||||||
movdqa XMMWORD [edi], xmm6 ; Save Y
|
movdqa XMMWORD [edi], xmm6 ; Save Y
|
||||||
|
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm2,xmm3 ; xmm2=REL
|
punpcklwd xmm2, xmm3 ; xmm2=REL
|
||||||
punpckhwd xmm4,xmm3 ; xmm4=REH
|
punpckhwd xmm4, xmm3 ; xmm4=REH
|
||||||
psrld xmm2,1 ; xmm2=REL*FIX(0.500)
|
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
|
||||||
psrld xmm4,1 ; xmm4=REH*FIX(0.500)
|
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
|
||||||
|
|
||||||
movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
|
movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
|
||||||
|
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm5,xmm4
|
paddd xmm5, xmm4
|
||||||
paddd xmm1,xmm0
|
paddd xmm1, xmm0
|
||||||
paddd xmm5,xmm0
|
paddd xmm5, xmm0
|
||||||
psrld xmm1,SCALEBITS ; xmm1=CrEL
|
psrld xmm1, SCALEBITS ; xmm1=CrEL
|
||||||
psrld xmm5,SCALEBITS ; xmm5=CrEH
|
psrld xmm5, SCALEBITS ; xmm5=CrEH
|
||||||
packssdw xmm1,xmm5 ; xmm1=CrE
|
packssdw xmm1, xmm5 ; xmm1=CrE
|
||||||
|
|
||||||
psllw xmm7,BYTE_BIT
|
psllw xmm7, BYTE_BIT
|
||||||
por xmm1,xmm7 ; xmm1=Cr
|
por xmm1, xmm7 ; xmm1=Cr
|
||||||
movdqa XMMWORD [edx], xmm1 ; Save Cr
|
movdqa XMMWORD [edx], xmm1 ; Save Cr
|
||||||
|
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
@@ -470,7 +470,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
add edx, byte SIZEOF_XMMWORD ; outptr2
|
add edx, byte SIZEOF_XMMWORD ; outptr2
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jnz near .column_ld1
|
jnz near .column_ld1
|
||||||
|
|
||||||
pop ecx ; col
|
pop ecx ; col
|
||||||
@@ -493,7 +493,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
pop ebx
|
pop ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -42,17 +42,17 @@
|
|||||||
|
|
||||||
EXTN(jsimd_rgb_gray_convert_sse2):
|
EXTN(jsimd_rgb_gray_convert_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d
|
mov ecx, r10d
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push rcx
|
push rcx
|
||||||
@@ -66,7 +66,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
|
|
||||||
mov rsi, r11
|
mov rsi, r11
|
||||||
mov eax, r14d
|
mov eax, r14d
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jle near .return
|
jle near .return
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push rdi
|
push rdi
|
||||||
@@ -84,7 +84,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
.column_ld1:
|
.column_ld1:
|
||||||
push rax
|
push rax
|
||||||
push rdx
|
push rdx
|
||||||
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
||||||
test cl, SIZEOF_BYTE
|
test cl, SIZEOF_BYTE
|
||||||
jz short .column_ld2
|
jz short .column_ld2
|
||||||
sub rcx, byte SIZEOF_BYTE
|
sub rcx, byte SIZEOF_BYTE
|
||||||
@@ -95,9 +95,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
sub rcx, byte SIZEOF_WORD
|
sub rcx, byte SIZEOF_WORD
|
||||||
movzx rdx, WORD [rsi+rcx]
|
movzx rdx, WORD [rsi+rcx]
|
||||||
shl rax, WORD_BIT
|
shl rax, WORD_BIT
|
||||||
or rax,rdx
|
or rax, rdx
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
movd xmmA,eax
|
movd xmmA, eax
|
||||||
pop rdx
|
pop rdx
|
||||||
pop rax
|
pop rax
|
||||||
test cl, SIZEOF_DWORD
|
test cl, SIZEOF_DWORD
|
||||||
@@ -105,18 +105,18 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
sub rcx, byte SIZEOF_DWORD
|
sub rcx, byte SIZEOF_DWORD
|
||||||
movd xmmF, XMM_DWORD [rsi+rcx]
|
movd xmmF, XMM_DWORD [rsi+rcx]
|
||||||
pslldq xmmA, SIZEOF_DWORD
|
pslldq xmmA, SIZEOF_DWORD
|
||||||
por xmmA,xmmF
|
por xmmA, xmmF
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_MMWORD
|
test cl, SIZEOF_MMWORD
|
||||||
jz short .column_ld16
|
jz short .column_ld16
|
||||||
sub rcx, byte SIZEOF_MMWORD
|
sub rcx, byte SIZEOF_MMWORD
|
||||||
movq xmmB, XMM_MMWORD [rsi+rcx]
|
movq xmmB, XMM_MMWORD [rsi+rcx]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmB
|
por xmmA, xmmB
|
||||||
.column_ld16:
|
.column_ld16:
|
||||||
test cl, SIZEOF_XMMWORD
|
test cl, SIZEOF_XMMWORD
|
||||||
jz short .column_ld32
|
jz short .column_ld32
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jmp short .rgb_gray_cnv
|
jmp short .rgb_gray_cnv
|
||||||
@@ -124,7 +124,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
test cl, 2*SIZEOF_XMMWORD
|
test cl, 2*SIZEOF_XMMWORD
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_gray_cnv
|
jz short .rgb_gray_cnv
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_gray_cnv
|
jmp short .rgb_gray_cnv
|
||||||
@@ -139,49 +139,49 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||||
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||||
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||||
|
|
||||||
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||||
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||||
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||||
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||||
|
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||||
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||||
|
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
pxor xmmH,xmmH
|
pxor xmmH, xmmH
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
|
|
||||||
movdqa xmmF,xmmD
|
movdqa xmmF, xmmD
|
||||||
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
@@ -196,19 +196,19 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
sub rcx, byte SIZEOF_XMMWORD/8
|
sub rcx, byte SIZEOF_XMMWORD/8
|
||||||
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmE
|
por xmmA, xmmE
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
test cl, SIZEOF_XMMWORD/4
|
test cl, SIZEOF_XMMWORD/4
|
||||||
jz short .column_ld8
|
jz short .column_ld8
|
||||||
sub rcx, byte SIZEOF_XMMWORD/4
|
sub rcx, byte SIZEOF_XMMWORD/4
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_XMMWORD/2
|
test cl, SIZEOF_XMMWORD/2
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_gray_cnv
|
jz short .rgb_gray_cnv
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqa xmmH,xmmE
|
movdqa xmmH, xmmE
|
||||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_gray_cnv
|
jmp short .rgb_gray_cnv
|
||||||
@@ -225,48 +225,48 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||||
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||||
|
|
||||||
movdqa xmmC,xmmF
|
movdqa xmmC, xmmF
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||||
|
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||||
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||||
|
|
||||||
movdqa xmmG,xmmD
|
movdqa xmmG, xmmD
|
||||||
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||||
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
movdqa xmmH,xmmB
|
movdqa xmmH, xmmB
|
||||||
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
pxor xmmF,xmmF
|
pxor xmmF, xmmF
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmD,xmmB
|
movdqa xmmD, xmmB
|
||||||
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
|
|
||||||
movdqa xmmG,xmmE
|
movdqa xmmG, xmmE
|
||||||
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
punpcklbw xmmF,xmmH
|
punpcklbw xmmF, xmmH
|
||||||
punpckhbw xmmH,xmmH
|
punpckhbw xmmH, xmmH
|
||||||
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|
||||||
@@ -279,19 +279,19 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; (This implementation)
|
; (This implementation)
|
||||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
punpcklwd xmm1,xmm3
|
punpcklwd xmm1, xmm3
|
||||||
punpckhwd xmm6,xmm3
|
punpckhwd xmm6, xmm3
|
||||||
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||||
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
|
|
||||||
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
|
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
punpcklwd xmm0,xmm2
|
punpcklwd xmm0, xmm2
|
||||||
punpckhwd xmm6,xmm2
|
punpckhwd xmm6, xmm2
|
||||||
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
@@ -299,40 +299,40 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
movdqa xmm0, xmm5 ; xmm0=BO
|
movdqa xmm0, xmm5 ; xmm0=BO
|
||||||
movdqa xmm6, xmm4 ; xmm6=BE
|
movdqa xmm6, xmm4 ; xmm6=BE
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
punpckhwd xmm4,xmm3
|
punpckhwd xmm4, xmm3
|
||||||
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||||
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||||
|
|
||||||
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
|
movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm0, xmm1
|
paddd xmm0, xmm1
|
||||||
paddd xmm4, xmm7
|
paddd xmm4, xmm7
|
||||||
paddd xmm0,xmm3
|
paddd xmm0, xmm3
|
||||||
paddd xmm4,xmm3
|
paddd xmm4, xmm3
|
||||||
psrld xmm0,SCALEBITS ; xmm0=YOL
|
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YOH
|
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||||
packssdw xmm0,xmm4 ; xmm0=YO
|
packssdw xmm0, xmm4 ; xmm0=YO
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpcklwd xmm6,xmm2
|
punpcklwd xmm6, xmm2
|
||||||
punpckhwd xmm4,xmm2
|
punpckhwd xmm4, xmm2
|
||||||
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||||
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||||
|
|
||||||
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
|
movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm6, XMMWORD [wk(0)]
|
paddd xmm6, XMMWORD [wk(0)]
|
||||||
paddd xmm4, XMMWORD [wk(1)]
|
paddd xmm4, XMMWORD [wk(1)]
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
psrld xmm6,SCALEBITS ; xmm6=YEL
|
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YEH
|
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||||
packssdw xmm6,xmm4 ; xmm6=YE
|
packssdw xmm6, xmm4 ; xmm6=YE
|
||||||
|
|
||||||
psllw xmm0,BYTE_BIT
|
psllw xmm0, BYTE_BIT
|
||||||
por xmm6,xmm0 ; xmm6=Y
|
por xmm6, xmm0 ; xmm6=Y
|
||||||
movdqa XMMWORD [rdi], xmm6 ; Save Y
|
movdqa XMMWORD [rdi], xmm6 ; Save Y
|
||||||
|
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
@@ -340,7 +340,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
add rdi, byte SIZEOF_XMMWORD ; outptr0
|
add rdi, byte SIZEOF_XMMWORD ; outptr0
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jnz near .column_ld1
|
jnz near .column_ld1
|
||||||
|
|
||||||
pop rcx ; col
|
pop rcx ; col
|
||||||
@@ -355,7 +355,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -44,11 +44,11 @@
|
|||||||
|
|
||||||
EXTN(jsimd_rgb_gray_convert_sse2):
|
EXTN(jsimd_rgb_gray_convert_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic eax ; make a room for GOT address
|
pushpic eax ; make a room for GOT address
|
||||||
push ebx
|
push ebx
|
||||||
@@ -61,7 +61,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
movpic POINTER [gotptr], ebx ; save GOT address
|
movpic POINTER [gotptr], ebx ; save GOT address
|
||||||
|
|
||||||
mov ecx, JDIMENSION [img_width(eax)]
|
mov ecx, JDIMENSION [img_width(eax)]
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push ecx
|
push ecx
|
||||||
@@ -75,9 +75,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
|
|
||||||
mov esi, JSAMPARRAY [input_buf(eax)]
|
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||||
mov eax, INT [num_rows(eax)]
|
mov eax, INT [num_rows(eax)]
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle near .return
|
jle near .return
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
pushpic eax
|
pushpic eax
|
||||||
push edi
|
push edi
|
||||||
@@ -90,14 +90,14 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||||
|
|
||||||
.column_ld1:
|
.column_ld1:
|
||||||
push eax
|
push eax
|
||||||
push edx
|
push edx
|
||||||
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||||
test cl, SIZEOF_BYTE
|
test cl, SIZEOF_BYTE
|
||||||
jz short .column_ld2
|
jz short .column_ld2
|
||||||
sub ecx, byte SIZEOF_BYTE
|
sub ecx, byte SIZEOF_BYTE
|
||||||
@@ -108,9 +108,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
sub ecx, byte SIZEOF_WORD
|
sub ecx, byte SIZEOF_WORD
|
||||||
movzx edx, WORD [esi+ecx]
|
movzx edx, WORD [esi+ecx]
|
||||||
shl eax, WORD_BIT
|
shl eax, WORD_BIT
|
||||||
or eax,edx
|
or eax, edx
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
movd xmmA,eax
|
movd xmmA, eax
|
||||||
pop edx
|
pop edx
|
||||||
pop eax
|
pop eax
|
||||||
test cl, SIZEOF_DWORD
|
test cl, SIZEOF_DWORD
|
||||||
@@ -118,18 +118,18 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
sub ecx, byte SIZEOF_DWORD
|
sub ecx, byte SIZEOF_DWORD
|
||||||
movd xmmF, XMM_DWORD [esi+ecx]
|
movd xmmF, XMM_DWORD [esi+ecx]
|
||||||
pslldq xmmA, SIZEOF_DWORD
|
pslldq xmmA, SIZEOF_DWORD
|
||||||
por xmmA,xmmF
|
por xmmA, xmmF
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_MMWORD
|
test cl, SIZEOF_MMWORD
|
||||||
jz short .column_ld16
|
jz short .column_ld16
|
||||||
sub ecx, byte SIZEOF_MMWORD
|
sub ecx, byte SIZEOF_MMWORD
|
||||||
movq xmmB, XMM_MMWORD [esi+ecx]
|
movq xmmB, XMM_MMWORD [esi+ecx]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmB
|
por xmmA, xmmB
|
||||||
.column_ld16:
|
.column_ld16:
|
||||||
test cl, SIZEOF_XMMWORD
|
test cl, SIZEOF_XMMWORD
|
||||||
jz short .column_ld32
|
jz short .column_ld32
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jmp short .rgb_gray_cnv
|
jmp short .rgb_gray_cnv
|
||||||
@@ -137,11 +137,11 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
test cl, 2*SIZEOF_XMMWORD
|
test cl, 2*SIZEOF_XMMWORD
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_gray_cnv
|
jz short .rgb_gray_cnv
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_gray_cnv
|
jmp short .rgb_gray_cnv
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
@@ -153,49 +153,49 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||||
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||||
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||||
|
|
||||||
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||||
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||||
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||||
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||||
|
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||||
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||||
|
|
||||||
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||||
|
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
pxor xmmH,xmmH
|
pxor xmmH, xmmH
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
|
|
||||||
movdqa xmmF,xmmD
|
movdqa xmmF, xmmD
|
||||||
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
@@ -210,23 +210,23 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
sub ecx, byte SIZEOF_XMMWORD/8
|
sub ecx, byte SIZEOF_XMMWORD/8
|
||||||
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||||
pslldq xmmA, SIZEOF_MMWORD
|
pslldq xmmA, SIZEOF_MMWORD
|
||||||
por xmmA,xmmE
|
por xmmA, xmmE
|
||||||
.column_ld4:
|
.column_ld4:
|
||||||
test cl, SIZEOF_XMMWORD/4
|
test cl, SIZEOF_XMMWORD/4
|
||||||
jz short .column_ld8
|
jz short .column_ld8
|
||||||
sub ecx, byte SIZEOF_XMMWORD/4
|
sub ecx, byte SIZEOF_XMMWORD/4
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||||
.column_ld8:
|
.column_ld8:
|
||||||
test cl, SIZEOF_XMMWORD/2
|
test cl, SIZEOF_XMMWORD/2
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jz short .rgb_gray_cnv
|
jz short .rgb_gray_cnv
|
||||||
movdqa xmmF,xmmA
|
movdqa xmmF, xmmA
|
||||||
movdqa xmmH,xmmE
|
movdqa xmmH, xmmE
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
jmp short .rgb_gray_cnv
|
jmp short .rgb_gray_cnv
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
@@ -240,48 +240,48 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||||
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||||
|
|
||||||
movdqa xmmC,xmmF
|
movdqa xmmC, xmmF
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||||
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||||
|
|
||||||
movdqa xmmB,xmmA
|
movdqa xmmB, xmmA
|
||||||
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||||
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||||
|
|
||||||
movdqa xmmG,xmmD
|
movdqa xmmG, xmmD
|
||||||
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||||
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||||
|
|
||||||
movdqa xmmE,xmmA
|
movdqa xmmE, xmmA
|
||||||
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||||
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
movdqa xmmH,xmmB
|
movdqa xmmH, xmmB
|
||||||
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||||
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
pxor xmmF,xmmF
|
pxor xmmF, xmmF
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||||
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||||
|
|
||||||
movdqa xmmD,xmmB
|
movdqa xmmD, xmmB
|
||||||
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||||
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||||
|
|
||||||
movdqa xmmG,xmmE
|
movdqa xmmG, xmmE
|
||||||
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||||
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||||
|
|
||||||
punpcklbw xmmF,xmmH
|
punpcklbw xmmF, xmmH
|
||||||
punpckhbw xmmH,xmmH
|
punpckhbw xmmH, xmmH
|
||||||
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||||
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|
||||||
@@ -294,19 +294,19 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; (This implementation)
|
; (This implementation)
|
||||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
punpcklwd xmm1,xmm3
|
punpcklwd xmm1, xmm3
|
||||||
punpckhwd xmm6,xmm3
|
punpckhwd xmm6, xmm3
|
||||||
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
|
|
||||||
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||||
|
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
punpcklwd xmm0,xmm2
|
punpcklwd xmm0, xmm2
|
||||||
punpckhwd xmm6,xmm2
|
punpckhwd xmm6, xmm2
|
||||||
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||||
@@ -314,40 +314,40 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
movdqa xmm0, xmm5 ; xmm0=BO
|
movdqa xmm0, xmm5 ; xmm0=BO
|
||||||
movdqa xmm6, xmm4 ; xmm6=BE
|
movdqa xmm6, xmm4 ; xmm6=BE
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
punpckhwd xmm4,xmm3
|
punpckhwd xmm4, xmm3
|
||||||
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||||
|
|
||||||
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
|
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm0, xmm1
|
paddd xmm0, xmm1
|
||||||
paddd xmm4, xmm7
|
paddd xmm4, xmm7
|
||||||
paddd xmm0,xmm3
|
paddd xmm0, xmm3
|
||||||
paddd xmm4,xmm3
|
paddd xmm4, xmm3
|
||||||
psrld xmm0,SCALEBITS ; xmm0=YOL
|
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YOH
|
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||||
packssdw xmm0,xmm4 ; xmm0=YO
|
packssdw xmm0, xmm4 ; xmm0=YO
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpcklwd xmm6,xmm2
|
punpcklwd xmm6, xmm2
|
||||||
punpckhwd xmm4,xmm2
|
punpckhwd xmm4, xmm2
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
|
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
|
||||||
|
|
||||||
paddd xmm6, XMMWORD [wk(0)]
|
paddd xmm6, XMMWORD [wk(0)]
|
||||||
paddd xmm4, XMMWORD [wk(1)]
|
paddd xmm4, XMMWORD [wk(1)]
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
psrld xmm6,SCALEBITS ; xmm6=YEL
|
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||||
psrld xmm4,SCALEBITS ; xmm4=YEH
|
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||||
packssdw xmm6,xmm4 ; xmm6=YE
|
packssdw xmm6, xmm4 ; xmm6=YE
|
||||||
|
|
||||||
psllw xmm0,BYTE_BIT
|
psllw xmm0, BYTE_BIT
|
||||||
por xmm6,xmm0 ; xmm6=Y
|
por xmm6, xmm0 ; xmm6=Y
|
||||||
movdqa XMMWORD [edi], xmm6 ; Save Y
|
movdqa XMMWORD [edi], xmm6 ; Save Y
|
||||||
|
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
@@ -355,7 +355,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
add edi, byte SIZEOF_XMMWORD ; outptr0
|
add edi, byte SIZEOF_XMMWORD ; outptr0
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jnz near .column_ld1
|
jnz near .column_ld1
|
||||||
|
|
||||||
pop ecx ; col
|
pop ecx ; col
|
||||||
@@ -374,7 +374,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
pop ebx
|
pop ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -187,10 +187,10 @@ EXTN(jconst_huff_encode_one_block):
|
|||||||
|
|
||||||
EXTN(jsimd_huff_encode_one_block_sse2):
|
EXTN(jsimd_huff_encode_one_block_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp,rsp ; rbp = aligned rbp
|
||||||
lea rsp, [t2]
|
lea rsp, [t2]
|
||||||
collect_args
|
collect_args
|
||||||
@@ -350,7 +350,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||||||
add rsp, 4*SIZEOF_XMMWORD
|
add rsp, 4*SIZEOF_XMMWORD
|
||||||
%endif
|
%endif
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -218,7 +218,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||||||
mov esi, ecx
|
mov esi, ecx
|
||||||
|
|
||||||
; This is a well-known technique for obtaining the absolute value
|
; This is a well-known technique for obtaining the absolute value
|
||||||
; without a branch. It is derived from an assembly language technique
|
; without a branch. It is derived from an assembly language technique
|
||||||
; presented in "How to Optimize for the Pentium Processors",
|
; presented in "How to Optimize for the Pentium Processors",
|
||||||
; Copyright (c) 1996, 1997 by Agner Fog.
|
; Copyright (c) 1996, 1997 by Agner Fog.
|
||||||
mov edx, ecx
|
mov edx, ecx
|
||||||
|
|||||||
@@ -44,12 +44,12 @@
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_downsample_sse2):
|
EXTN(jsimd_h2v1_downsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
mov ecx, r13d
|
mov ecx, r13d
|
||||||
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov edx, r10d
|
mov edx, r10d
|
||||||
@@ -57,12 +57,12 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
; -- expand_right_edge
|
; -- expand_right_edge
|
||||||
|
|
||||||
push rcx
|
push rcx
|
||||||
shl rcx,1 ; output_cols * 2
|
shl rcx, 1 ; output_cols * 2
|
||||||
sub rcx,rdx
|
sub rcx, rdx
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
mov rax, r11
|
mov rax, r11
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
cld
|
cld
|
||||||
@@ -72,7 +72,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
push rcx
|
push rcx
|
||||||
|
|
||||||
mov rdi, JSAMPROW [rsi]
|
mov rdi, JSAMPROW [rsi]
|
||||||
add rdi,rdx
|
add rdi, rdx
|
||||||
mov al, JSAMPLE [rdi-1]
|
mov al, JSAMPLE [rdi-1]
|
||||||
|
|
||||||
rep stosb
|
rep stosb
|
||||||
@@ -90,14 +90,14 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
; -- h2v1_downsample
|
; -- h2v1_downsample
|
||||||
|
|
||||||
mov eax, r12d ; rowctr
|
mov eax, r12d ; rowctr
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle near .return
|
jle near .return
|
||||||
|
|
||||||
mov rdx, 0x00010000 ; bias pattern
|
mov rdx, 0x00010000 ; bias pattern
|
||||||
movd xmm7,edx
|
movd xmm7, edx
|
||||||
pcmpeqw xmm6,xmm6
|
pcmpeqw xmm6, xmm6
|
||||||
pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
|
pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
|
||||||
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||||
|
|
||||||
mov rsi, r14 ; input_data
|
mov rsi, r14 ; input_data
|
||||||
mov rdi, r15 ; output_data
|
mov rdi, r15 ; output_data
|
||||||
@@ -114,7 +114,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
|
|
||||||
.columnloop_r8:
|
.columnloop_r8:
|
||||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
pxor xmm1,xmm1
|
pxor xmm1, xmm1
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jmp short .downsample
|
jmp short .downsample
|
||||||
|
|
||||||
@@ -123,22 +123,22 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
.downsample:
|
.downsample:
|
||||||
movdqa xmm2,xmm0
|
movdqa xmm2, xmm0
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
|
|
||||||
pand xmm0,xmm6
|
pand xmm0, xmm6
|
||||||
psrlw xmm2,BYTE_BIT
|
psrlw xmm2, BYTE_BIT
|
||||||
pand xmm1,xmm6
|
pand xmm1, xmm6
|
||||||
psrlw xmm3,BYTE_BIT
|
psrlw xmm3, BYTE_BIT
|
||||||
|
|
||||||
paddw xmm0,xmm2
|
paddw xmm0, xmm2
|
||||||
paddw xmm1,xmm3
|
paddw xmm1, xmm3
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
psrlw xmm0,1
|
psrlw xmm0, 1
|
||||||
psrlw xmm1,1
|
psrlw xmm1, 1
|
||||||
|
|
||||||
packuswb xmm0,xmm1
|
packuswb xmm0, xmm1
|
||||||
|
|
||||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||||
|
|
||||||
@@ -147,7 +147,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jae short .columnloop
|
jae short .columnloop
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jnz short .columnloop_r8
|
jnz short .columnloop_r8
|
||||||
|
|
||||||
pop rsi
|
pop rsi
|
||||||
@@ -188,12 +188,12 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_downsample_sse2):
|
EXTN(jsimd_h2v2_downsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
mov ecx, r13d
|
mov ecx, r13d
|
||||||
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov edx, r10d
|
mov edx, r10d
|
||||||
@@ -201,12 +201,12 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
; -- expand_right_edge
|
; -- expand_right_edge
|
||||||
|
|
||||||
push rcx
|
push rcx
|
||||||
shl rcx,1 ; output_cols * 2
|
shl rcx, 1 ; output_cols * 2
|
||||||
sub rcx,rdx
|
sub rcx, rdx
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
mov rax, r11
|
mov rax, r11
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
cld
|
cld
|
||||||
@@ -216,7 +216,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
push rcx
|
push rcx
|
||||||
|
|
||||||
mov rdi, JSAMPROW [rsi]
|
mov rdi, JSAMPROW [rsi]
|
||||||
add rdi,rdx
|
add rdi, rdx
|
||||||
mov al, JSAMPLE [rdi-1]
|
mov al, JSAMPLE [rdi-1]
|
||||||
|
|
||||||
rep stosb
|
rep stosb
|
||||||
@@ -234,14 +234,14 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
; -- h2v2_downsample
|
; -- h2v2_downsample
|
||||||
|
|
||||||
mov eax, r12d ; rowctr
|
mov eax, r12d ; rowctr
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jle near .return
|
jle near .return
|
||||||
|
|
||||||
mov rdx, 0x00020001 ; bias pattern
|
mov rdx, 0x00020001 ; bias pattern
|
||||||
movd xmm7,edx
|
movd xmm7, edx
|
||||||
pcmpeqw xmm6,xmm6
|
pcmpeqw xmm6, xmm6
|
||||||
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
|
pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
|
||||||
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||||
|
|
||||||
mov rsi, r14 ; input_data
|
mov rsi, r14 ; input_data
|
||||||
mov rdi, r15 ; output_data
|
mov rdi, r15 ; output_data
|
||||||
@@ -260,8 +260,8 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
.columnloop_r8:
|
.columnloop_r8:
|
||||||
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
pxor xmm3,xmm3
|
pxor xmm3, xmm3
|
||||||
mov rcx, SIZEOF_XMMWORD
|
mov rcx, SIZEOF_XMMWORD
|
||||||
jmp short .downsample
|
jmp short .downsample
|
||||||
|
|
||||||
@@ -272,32 +272,32 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
.downsample:
|
.downsample:
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
pand xmm0,xmm6
|
pand xmm0, xmm6
|
||||||
psrlw xmm4,BYTE_BIT
|
psrlw xmm4, BYTE_BIT
|
||||||
pand xmm1,xmm6
|
pand xmm1, xmm6
|
||||||
psrlw xmm5,BYTE_BIT
|
psrlw xmm5, BYTE_BIT
|
||||||
paddw xmm0,xmm4
|
paddw xmm0, xmm4
|
||||||
paddw xmm1,xmm5
|
paddw xmm1, xmm5
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
pand xmm2,xmm6
|
pand xmm2, xmm6
|
||||||
psrlw xmm4,BYTE_BIT
|
psrlw xmm4, BYTE_BIT
|
||||||
pand xmm3,xmm6
|
pand xmm3, xmm6
|
||||||
psrlw xmm5,BYTE_BIT
|
psrlw xmm5, BYTE_BIT
|
||||||
paddw xmm2,xmm4
|
paddw xmm2, xmm4
|
||||||
paddw xmm3,xmm5
|
paddw xmm3, xmm5
|
||||||
|
|
||||||
paddw xmm0,xmm1
|
paddw xmm0, xmm1
|
||||||
paddw xmm2,xmm3
|
paddw xmm2, xmm3
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm2,xmm7
|
paddw xmm2, xmm7
|
||||||
psrlw xmm0,2
|
psrlw xmm0, 2
|
||||||
psrlw xmm2,2
|
psrlw xmm2, 2
|
||||||
|
|
||||||
packuswb xmm0,xmm2
|
packuswb xmm0, xmm2
|
||||||
|
|
||||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||||
|
|
||||||
@@ -307,7 +307,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jnz near .columnloop_r8
|
jnz near .columnloop_r8
|
||||||
|
|
||||||
pop rsi
|
pop rsi
|
||||||
|
|||||||
@@ -43,7 +43,7 @@
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_downsample_sse2):
|
EXTN(jsimd_h2v1_downsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
; push ebx ; unused
|
; push ebx ; unused
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -51,7 +51,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
push edi
|
push edi
|
||||||
|
|
||||||
mov ecx, JDIMENSION [width_blks(ebp)]
|
mov ecx, JDIMENSION [width_blks(ebp)]
|
||||||
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
|
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov edx, JDIMENSION [img_width(ebp)]
|
mov edx, JDIMENSION [img_width(ebp)]
|
||||||
@@ -59,23 +59,23 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
; -- expand_right_edge
|
; -- expand_right_edge
|
||||||
|
|
||||||
push ecx
|
push ecx
|
||||||
shl ecx,1 ; output_cols * 2
|
shl ecx, 1 ; output_cols * 2
|
||||||
sub ecx,edx
|
sub ecx, edx
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
mov eax, INT [max_v_samp(ebp)]
|
mov eax, INT [max_v_samp(ebp)]
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
cld
|
cld
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.expandloop:
|
.expandloop:
|
||||||
push eax
|
push eax
|
||||||
push ecx
|
push ecx
|
||||||
|
|
||||||
mov edi, JSAMPROW [esi]
|
mov edi, JSAMPROW [esi]
|
||||||
add edi,edx
|
add edi, edx
|
||||||
mov al, JSAMPLE [edi-1]
|
mov al, JSAMPLE [edi-1]
|
||||||
|
|
||||||
rep stosb
|
rep stosb
|
||||||
@@ -93,18 +93,18 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
; -- h2v1_downsample
|
; -- h2v1_downsample
|
||||||
|
|
||||||
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
|
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle near .return
|
jle near .return
|
||||||
|
|
||||||
mov edx, 0x00010000 ; bias pattern
|
mov edx, 0x00010000 ; bias pattern
|
||||||
movd xmm7,edx
|
movd xmm7, edx
|
||||||
pcmpeqw xmm6,xmm6
|
pcmpeqw xmm6, xmm6
|
||||||
pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
|
pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
|
||||||
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
|
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push ecx
|
push ecx
|
||||||
push edi
|
push edi
|
||||||
@@ -115,36 +115,36 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae short .columnloop
|
jae short .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop_r8:
|
.columnloop_r8:
|
||||||
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
pxor xmm1,xmm1
|
pxor xmm1, xmm1
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jmp short .downsample
|
jmp short .downsample
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
.downsample:
|
.downsample:
|
||||||
movdqa xmm2,xmm0
|
movdqa xmm2, xmm0
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
|
|
||||||
pand xmm0,xmm6
|
pand xmm0, xmm6
|
||||||
psrlw xmm2,BYTE_BIT
|
psrlw xmm2, BYTE_BIT
|
||||||
pand xmm1,xmm6
|
pand xmm1, xmm6
|
||||||
psrlw xmm3,BYTE_BIT
|
psrlw xmm3, BYTE_BIT
|
||||||
|
|
||||||
paddw xmm0,xmm2
|
paddw xmm0, xmm2
|
||||||
paddw xmm1,xmm3
|
paddw xmm1, xmm3
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
psrlw xmm0,1
|
psrlw xmm0, 1
|
||||||
psrlw xmm1,1
|
psrlw xmm1, 1
|
||||||
|
|
||||||
packuswb xmm0,xmm1
|
packuswb xmm0, xmm1
|
||||||
|
|
||||||
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
|
||||||
|
|
||||||
@@ -153,7 +153,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
add edi, byte 1*SIZEOF_XMMWORD ; outptr
|
add edi, byte 1*SIZEOF_XMMWORD ; outptr
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae short .columnloop
|
jae short .columnloop
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jnz short .columnloop_r8
|
jnz short .columnloop_r8
|
||||||
|
|
||||||
pop esi
|
pop esi
|
||||||
@@ -198,7 +198,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_downsample_sse2):
|
EXTN(jsimd_h2v2_downsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
; push ebx ; unused
|
; push ebx ; unused
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -206,7 +206,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
push edi
|
push edi
|
||||||
|
|
||||||
mov ecx, JDIMENSION [width_blks(ebp)]
|
mov ecx, JDIMENSION [width_blks(ebp)]
|
||||||
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
|
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov edx, JDIMENSION [img_width(ebp)]
|
mov edx, JDIMENSION [img_width(ebp)]
|
||||||
@@ -214,23 +214,23 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
; -- expand_right_edge
|
; -- expand_right_edge
|
||||||
|
|
||||||
push ecx
|
push ecx
|
||||||
shl ecx,1 ; output_cols * 2
|
shl ecx, 1 ; output_cols * 2
|
||||||
sub ecx,edx
|
sub ecx, edx
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
mov eax, INT [max_v_samp(ebp)]
|
mov eax, INT [max_v_samp(ebp)]
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle short .expand_end
|
jle short .expand_end
|
||||||
|
|
||||||
cld
|
cld
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.expandloop:
|
.expandloop:
|
||||||
push eax
|
push eax
|
||||||
push ecx
|
push ecx
|
||||||
|
|
||||||
mov edi, JSAMPROW [esi]
|
mov edi, JSAMPROW [esi]
|
||||||
add edi,edx
|
add edi, edx
|
||||||
mov al, JSAMPLE [edi-1]
|
mov al, JSAMPLE [edi-1]
|
||||||
|
|
||||||
rep stosb
|
rep stosb
|
||||||
@@ -248,18 +248,18 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
; -- h2v2_downsample
|
; -- h2v2_downsample
|
||||||
|
|
||||||
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
|
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle near .return
|
jle near .return
|
||||||
|
|
||||||
mov edx, 0x00020001 ; bias pattern
|
mov edx, 0x00020001 ; bias pattern
|
||||||
movd xmm7,edx
|
movd xmm7, edx
|
||||||
pcmpeqw xmm6,xmm6
|
pcmpeqw xmm6, xmm6
|
||||||
pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
|
pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
|
||||||
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
|
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push ecx
|
push ecx
|
||||||
push edi
|
push edi
|
||||||
@@ -271,16 +271,16 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae short .columnloop
|
jae short .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop_r8:
|
.columnloop_r8:
|
||||||
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
pxor xmm3,xmm3
|
pxor xmm3, xmm3
|
||||||
mov ecx, SIZEOF_XMMWORD
|
mov ecx, SIZEOF_XMMWORD
|
||||||
jmp short .downsample
|
jmp short .downsample
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
|
||||||
@@ -289,32 +289,32 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
.downsample:
|
.downsample:
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
pand xmm0,xmm6
|
pand xmm0, xmm6
|
||||||
psrlw xmm4,BYTE_BIT
|
psrlw xmm4, BYTE_BIT
|
||||||
pand xmm1,xmm6
|
pand xmm1, xmm6
|
||||||
psrlw xmm5,BYTE_BIT
|
psrlw xmm5, BYTE_BIT
|
||||||
paddw xmm0,xmm4
|
paddw xmm0, xmm4
|
||||||
paddw xmm1,xmm5
|
paddw xmm1, xmm5
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
pand xmm2,xmm6
|
pand xmm2, xmm6
|
||||||
psrlw xmm4,BYTE_BIT
|
psrlw xmm4, BYTE_BIT
|
||||||
pand xmm3,xmm6
|
pand xmm3, xmm6
|
||||||
psrlw xmm5,BYTE_BIT
|
psrlw xmm5, BYTE_BIT
|
||||||
paddw xmm2,xmm4
|
paddw xmm2, xmm4
|
||||||
paddw xmm3,xmm5
|
paddw xmm3, xmm5
|
||||||
|
|
||||||
paddw xmm0,xmm1
|
paddw xmm0, xmm1
|
||||||
paddw xmm2,xmm3
|
paddw xmm2, xmm3
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm2,xmm7
|
paddw xmm2, xmm7
|
||||||
psrlw xmm0,2
|
psrlw xmm0, 2
|
||||||
psrlw xmm2,2
|
psrlw xmm2, 2
|
||||||
|
|
||||||
packuswb xmm0,xmm2
|
packuswb xmm0, xmm2
|
||||||
|
|
||||||
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
|
||||||
|
|
||||||
@@ -324,7 +324,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
add edi, byte 1*SIZEOF_XMMWORD ; outptr
|
add edi, byte 1*SIZEOF_XMMWORD ; outptr
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jae near .columnloop
|
jae near .columnloop
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jnz near .columnloop_r8
|
jnz near .columnloop_r8
|
||||||
|
|
||||||
pop esi
|
pop esi
|
||||||
|
|||||||
@@ -42,17 +42,17 @@
|
|||||||
|
|
||||||
EXTN(jsimd_ycc_rgb_convert_sse2):
|
EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d ; num_cols
|
mov ecx, r10d ; num_cols
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push rcx
|
push rcx
|
||||||
@@ -70,7 +70,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
|
|
||||||
mov rdi, r13
|
mov rdi, r13
|
||||||
mov eax, r14d
|
mov eax, r14d
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jle near .return
|
jle near .return
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push rax
|
push rax
|
||||||
@@ -89,21 +89,21 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
|
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
|
||||||
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
|
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
|
||||||
|
|
||||||
pcmpeqw xmm4,xmm4
|
pcmpeqw xmm4, xmm4
|
||||||
pcmpeqw xmm7,xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
psrlw xmm4,BYTE_BIT
|
psrlw xmm4, BYTE_BIT
|
||||||
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||||
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
|
movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
|
||||||
|
|
||||||
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
|
pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
|
||||||
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
|
psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
|
||||||
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
|
pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
|
||||||
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
|
psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
|
||||||
|
|
||||||
paddw xmm4,xmm7
|
paddw xmm4, xmm7
|
||||||
paddw xmm5,xmm7
|
paddw xmm5, xmm7
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; R = Y + 1.40200 * Cr
|
; R = Y + 1.40200 * Cr
|
||||||
@@ -115,85 +115,85 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||||
|
|
||||||
movdqa xmm2,xmm4 ; xmm2=CbE
|
movdqa xmm2, xmm4 ; xmm2=CbE
|
||||||
movdqa xmm3,xmm5 ; xmm3=CbO
|
movdqa xmm3, xmm5 ; xmm3=CbO
|
||||||
paddw xmm4,xmm4 ; xmm4=2*CbE
|
paddw xmm4, xmm4 ; xmm4=2*CbE
|
||||||
paddw xmm5,xmm5 ; xmm5=2*CbO
|
paddw xmm5, xmm5 ; xmm5=2*CbO
|
||||||
movdqa xmm6,xmm0 ; xmm6=CrE
|
movdqa xmm6, xmm0 ; xmm6=CrE
|
||||||
movdqa xmm7,xmm1 ; xmm7=CrO
|
movdqa xmm7, xmm1 ; xmm7=CrO
|
||||||
paddw xmm0,xmm0 ; xmm0=2*CrE
|
paddw xmm0, xmm0 ; xmm0=2*CrE
|
||||||
paddw xmm1,xmm1 ; xmm1=2*CrO
|
paddw xmm1, xmm1 ; xmm1=2*CrO
|
||||||
|
|
||||||
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
|
pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
|
||||||
pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
|
pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
|
||||||
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
|
pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
|
||||||
pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
|
pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm4,[rel PW_ONE]
|
paddw xmm4, [rel PW_ONE]
|
||||||
paddw xmm5,[rel PW_ONE]
|
paddw xmm5, [rel PW_ONE]
|
||||||
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
|
psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
|
||||||
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
|
psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
|
||||||
paddw xmm0,[rel PW_ONE]
|
paddw xmm0, [rel PW_ONE]
|
||||||
paddw xmm1,[rel PW_ONE]
|
paddw xmm1, [rel PW_ONE]
|
||||||
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
|
psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
|
||||||
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
|
psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm4,xmm2
|
paddw xmm4, xmm2
|
||||||
paddw xmm5,xmm3
|
paddw xmm5, xmm3
|
||||||
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
|
paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||||
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
|
paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||||
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
|
paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||||
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
|
paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
|
||||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
punpcklwd xmm2,xmm6
|
punpcklwd xmm2, xmm6
|
||||||
punpckhwd xmm4,xmm6
|
punpckhwd xmm4, xmm6
|
||||||
pmaddwd xmm2,[rel PW_MF0344_F0285]
|
pmaddwd xmm2, [rel PW_MF0344_F0285]
|
||||||
pmaddwd xmm4,[rel PW_MF0344_F0285]
|
pmaddwd xmm4, [rel PW_MF0344_F0285]
|
||||||
punpcklwd xmm3,xmm7
|
punpcklwd xmm3, xmm7
|
||||||
punpckhwd xmm5,xmm7
|
punpckhwd xmm5, xmm7
|
||||||
pmaddwd xmm3,[rel PW_MF0344_F0285]
|
pmaddwd xmm3, [rel PW_MF0344_F0285]
|
||||||
pmaddwd xmm5,[rel PW_MF0344_F0285]
|
pmaddwd xmm5, [rel PW_MF0344_F0285]
|
||||||
|
|
||||||
paddd xmm2,[rel PD_ONEHALF]
|
paddd xmm2, [rel PD_ONEHALF]
|
||||||
paddd xmm4,[rel PD_ONEHALF]
|
paddd xmm4, [rel PD_ONEHALF]
|
||||||
psrad xmm2,SCALEBITS
|
psrad xmm2, SCALEBITS
|
||||||
psrad xmm4,SCALEBITS
|
psrad xmm4, SCALEBITS
|
||||||
paddd xmm3,[rel PD_ONEHALF]
|
paddd xmm3, [rel PD_ONEHALF]
|
||||||
paddd xmm5,[rel PD_ONEHALF]
|
paddd xmm5, [rel PD_ONEHALF]
|
||||||
psrad xmm3,SCALEBITS
|
psrad xmm3, SCALEBITS
|
||||||
psrad xmm5,SCALEBITS
|
psrad xmm5, SCALEBITS
|
||||||
|
|
||||||
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||||
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||||
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||||
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
|
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
|
||||||
|
|
||||||
pcmpeqw xmm4,xmm4
|
pcmpeqw xmm4, xmm4
|
||||||
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
|
||||||
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
|
pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
|
||||||
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
|
psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
|
||||||
|
|
||||||
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
|
paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
|
||||||
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
|
paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
|
||||||
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
|
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
|
||||||
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
|
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
|
||||||
|
|
||||||
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
|
paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
|
||||||
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
|
paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
|
||||||
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
|
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
|
||||||
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
|
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
|
||||||
|
|
||||||
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
|
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
|
||||||
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
|
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
|
||||||
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
|
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
|
||||||
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
|
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
|
||||||
|
|
||||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||||
|
|
||||||
@@ -202,44 +202,44 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
movdqa xmmH,xmmA
|
movdqa xmmH, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
||||||
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
||||||
|
|
||||||
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
||||||
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
||||||
|
|
||||||
movdqa xmmC,xmmD
|
movdqa xmmC, xmmD
|
||||||
movdqa xmmB,xmmD
|
movdqa xmmB, xmmD
|
||||||
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
||||||
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
||||||
|
|
||||||
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
||||||
|
|
||||||
movdqa xmmF,xmmE
|
movdqa xmmF, xmmE
|
||||||
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
||||||
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
||||||
|
|
||||||
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
||||||
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
||||||
|
|
||||||
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
||||||
movdqa xmmB,xmmF
|
movdqa xmmB, xmmF
|
||||||
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
||||||
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
||||||
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
||||||
|
|
||||||
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||||
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -272,7 +272,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA, xmmF
|
||||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
@@ -280,7 +280,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||||
@@ -320,35 +320,35 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
%ifdef RGBX_FILLER_0XFF
|
%ifdef RGBX_FILLER_0XFF
|
||||||
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%else
|
%else
|
||||||
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%endif
|
%endif
|
||||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
||||||
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
||||||
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
||||||
movdqa xmmG,xmmB
|
movdqa xmmG, xmmB
|
||||||
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
||||||
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||||
movdqa xmmH,xmmC
|
movdqa xmmH, xmmC
|
||||||
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -382,15 +382,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA, xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD, xmmH
|
||||||
sub rcx, byte SIZEOF_XMMWORD/2
|
sub rcx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD/4
|
sub rcx, byte SIZEOF_XMMWORD/4
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||||
@@ -430,7 +430,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -44,11 +44,11 @@
|
|||||||
|
|
||||||
EXTN(jsimd_ycc_rgb_convert_sse2):
|
EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic eax ; make a room for GOT address
|
pushpic eax ; make a room for GOT address
|
||||||
push ebx
|
push ebx
|
||||||
@@ -61,7 +61,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movpic POINTER [gotptr], ebx ; save GOT address
|
movpic POINTER [gotptr], ebx ; save GOT address
|
||||||
|
|
||||||
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
|
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push ecx
|
push ecx
|
||||||
@@ -79,9 +79,9 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
|
|
||||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||||
mov eax, INT [num_rows(eax)]
|
mov eax, INT [num_rows(eax)]
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jle near .return
|
jle near .return
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push eax
|
push eax
|
||||||
push edi
|
push edi
|
||||||
@@ -95,27 +95,27 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
mov edx, JSAMPROW [edx] ; inptr2
|
mov edx, JSAMPROW [edx] ; inptr2
|
||||||
mov edi, JSAMPROW [edi] ; outptr
|
mov edi, JSAMPROW [edi] ; outptr
|
||||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.columnloop:
|
.columnloop:
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
|
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
|
||||||
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
|
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
|
||||||
|
|
||||||
pcmpeqw xmm4,xmm4
|
pcmpeqw xmm4, xmm4
|
||||||
pcmpeqw xmm7,xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
psrlw xmm4,BYTE_BIT
|
psrlw xmm4, BYTE_BIT
|
||||||
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||||
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
|
movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
|
||||||
|
|
||||||
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
|
pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
|
||||||
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
|
psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
|
||||||
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
|
pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
|
||||||
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
|
psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
|
||||||
|
|
||||||
paddw xmm4,xmm7
|
paddw xmm4, xmm7
|
||||||
paddw xmm5,xmm7
|
paddw xmm5, xmm7
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; R = Y + 1.40200 * Cr
|
; R = Y + 1.40200 * Cr
|
||||||
@@ -127,85 +127,85 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||||
|
|
||||||
movdqa xmm2,xmm4 ; xmm2=CbE
|
movdqa xmm2, xmm4 ; xmm2=CbE
|
||||||
movdqa xmm3,xmm5 ; xmm3=CbO
|
movdqa xmm3, xmm5 ; xmm3=CbO
|
||||||
paddw xmm4,xmm4 ; xmm4=2*CbE
|
paddw xmm4, xmm4 ; xmm4=2*CbE
|
||||||
paddw xmm5,xmm5 ; xmm5=2*CbO
|
paddw xmm5, xmm5 ; xmm5=2*CbO
|
||||||
movdqa xmm6,xmm0 ; xmm6=CrE
|
movdqa xmm6, xmm0 ; xmm6=CrE
|
||||||
movdqa xmm7,xmm1 ; xmm7=CrO
|
movdqa xmm7, xmm1 ; xmm7=CrO
|
||||||
paddw xmm0,xmm0 ; xmm0=2*CrE
|
paddw xmm0, xmm0 ; xmm0=2*CrE
|
||||||
paddw xmm1,xmm1 ; xmm1=2*CrO
|
paddw xmm1, xmm1 ; xmm1=2*CrO
|
||||||
|
|
||||||
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
|
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
|
||||||
pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
|
pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
|
||||||
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
|
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
|
||||||
pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
|
pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm4,[GOTOFF(eax,PW_ONE)]
|
paddw xmm4, [GOTOFF(eax,PW_ONE)]
|
||||||
paddw xmm5,[GOTOFF(eax,PW_ONE)]
|
paddw xmm5, [GOTOFF(eax,PW_ONE)]
|
||||||
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
|
psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
|
||||||
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
|
psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
|
||||||
paddw xmm0,[GOTOFF(eax,PW_ONE)]
|
paddw xmm0, [GOTOFF(eax,PW_ONE)]
|
||||||
paddw xmm1,[GOTOFF(eax,PW_ONE)]
|
paddw xmm1, [GOTOFF(eax,PW_ONE)]
|
||||||
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
|
psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
|
||||||
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
|
psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm4,xmm2
|
paddw xmm4, xmm2
|
||||||
paddw xmm5,xmm3
|
paddw xmm5, xmm3
|
||||||
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
|
paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||||
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
|
paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||||
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
|
paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||||
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
|
paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
|
||||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
punpcklwd xmm2,xmm6
|
punpcklwd xmm2, xmm6
|
||||||
punpckhwd xmm4,xmm6
|
punpckhwd xmm4, xmm6
|
||||||
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
punpcklwd xmm3,xmm7
|
punpcklwd xmm3, xmm7
|
||||||
punpckhwd xmm5,xmm7
|
punpckhwd xmm5, xmm7
|
||||||
pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
|
|
||||||
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
psrad xmm2,SCALEBITS
|
psrad xmm2, SCALEBITS
|
||||||
psrad xmm4,SCALEBITS
|
psrad xmm4, SCALEBITS
|
||||||
paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
psrad xmm3,SCALEBITS
|
psrad xmm3, SCALEBITS
|
||||||
psrad xmm5,SCALEBITS
|
psrad xmm5, SCALEBITS
|
||||||
|
|
||||||
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||||
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||||
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||||
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
|
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
|
||||||
|
|
||||||
pcmpeqw xmm4,xmm4
|
pcmpeqw xmm4, xmm4
|
||||||
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
|
||||||
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
|
pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
|
||||||
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
|
psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
|
||||||
|
|
||||||
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
|
paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
|
||||||
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
|
paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
|
||||||
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
|
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
|
||||||
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
|
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
|
||||||
|
|
||||||
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
|
paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
|
||||||
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
|
paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
|
||||||
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
|
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
|
||||||
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
|
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
|
||||||
|
|
||||||
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
|
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
|
||||||
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
|
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
|
||||||
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
|
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
|
||||||
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
|
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
|
||||||
|
|
||||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||||
|
|
||||||
@@ -214,44 +214,44 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
movdqa xmmH,xmmA
|
movdqa xmmH, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
||||||
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
||||||
|
|
||||||
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
||||||
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
||||||
|
|
||||||
movdqa xmmC,xmmD
|
movdqa xmmC, xmmD
|
||||||
movdqa xmmB,xmmD
|
movdqa xmmB, xmmD
|
||||||
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
||||||
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
||||||
|
|
||||||
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
||||||
|
|
||||||
movdqa xmmF,xmmE
|
movdqa xmmF, xmmE
|
||||||
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
||||||
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
||||||
|
|
||||||
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
||||||
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
||||||
|
|
||||||
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
||||||
movdqa xmmB,xmmF
|
movdqa xmmB, xmmF
|
||||||
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
||||||
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
||||||
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
||||||
|
|
||||||
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||||
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -276,7 +276,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||||
jmp near .columnloop
|
jmp near .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.column_st32:
|
.column_st32:
|
||||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||||
@@ -285,7 +285,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA, xmmF
|
||||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
@@ -293,7 +293,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||||
@@ -333,35 +333,35 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
%ifdef RGBX_FILLER_0XFF
|
%ifdef RGBX_FILLER_0XFF
|
||||||
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%else
|
%else
|
||||||
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%endif
|
%endif
|
||||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
||||||
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
||||||
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
||||||
movdqa xmmG,xmmB
|
movdqa xmmG, xmmB
|
||||||
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
||||||
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||||
movdqa xmmH,xmmC
|
movdqa xmmH, xmmC
|
||||||
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -388,7 +388,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||||
jmp near .columnloop
|
jmp near .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.column_st32:
|
.column_st32:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||||
@@ -396,15 +396,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA, xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD, xmmH
|
||||||
sub ecx, byte SIZEOF_XMMWORD/2
|
sub ecx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD/4
|
sub ecx, byte SIZEOF_XMMWORD/4
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||||
@@ -424,7 +424,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.nextrow:
|
.nextrow:
|
||||||
pop ecx
|
pop ecx
|
||||||
@@ -449,7 +449,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
pop ebx
|
pop ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -42,17 +42,17 @@
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_merged_upsample_sse2):
|
EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d ; col
|
mov ecx, r10d ; col
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push rcx
|
push rcx
|
||||||
@@ -75,21 +75,21 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
|
movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
|
||||||
movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
|
movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
|
||||||
|
|
||||||
pxor xmm1,xmm1 ; xmm1=(all 0's)
|
pxor xmm1, xmm1 ; xmm1=(all 0's)
|
||||||
pcmpeqw xmm3,xmm3
|
pcmpeqw xmm3, xmm3
|
||||||
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
|
punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
|
||||||
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
|
punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
|
punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
|
||||||
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
|
punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
|
||||||
|
|
||||||
paddw xmm6,xmm3
|
paddw xmm6, xmm3
|
||||||
paddw xmm4,xmm3
|
paddw xmm4, xmm3
|
||||||
paddw xmm7,xmm3
|
paddw xmm7, xmm3
|
||||||
paddw xmm0,xmm3
|
paddw xmm0, xmm3
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; R = Y + 1.40200 * Cr
|
; R = Y + 1.40200 * Cr
|
||||||
@@ -101,67 +101,67 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||||
|
|
||||||
movdqa xmm5,xmm6 ; xmm5=CbH
|
movdqa xmm5, xmm6 ; xmm5=CbH
|
||||||
movdqa xmm2,xmm4 ; xmm2=CbL
|
movdqa xmm2, xmm4 ; xmm2=CbL
|
||||||
paddw xmm6,xmm6 ; xmm6=2*CbH
|
paddw xmm6, xmm6 ; xmm6=2*CbH
|
||||||
paddw xmm4,xmm4 ; xmm4=2*CbL
|
paddw xmm4, xmm4 ; xmm4=2*CbL
|
||||||
movdqa xmm1,xmm7 ; xmm1=CrH
|
movdqa xmm1, xmm7 ; xmm1=CrH
|
||||||
movdqa xmm3,xmm0 ; xmm3=CrL
|
movdqa xmm3, xmm0 ; xmm3=CrL
|
||||||
paddw xmm7,xmm7 ; xmm7=2*CrH
|
paddw xmm7, xmm7 ; xmm7=2*CrH
|
||||||
paddw xmm0,xmm0 ; xmm0=2*CrL
|
paddw xmm0, xmm0 ; xmm0=2*CrL
|
||||||
|
|
||||||
pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
|
pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
|
||||||
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
|
pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
|
||||||
pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
|
pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
|
||||||
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
|
pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm6,[rel PW_ONE]
|
paddw xmm6, [rel PW_ONE]
|
||||||
paddw xmm4,[rel PW_ONE]
|
paddw xmm4, [rel PW_ONE]
|
||||||
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
|
psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
|
||||||
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
|
psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
|
||||||
paddw xmm7,[rel PW_ONE]
|
paddw xmm7, [rel PW_ONE]
|
||||||
paddw xmm0,[rel PW_ONE]
|
paddw xmm0, [rel PW_ONE]
|
||||||
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
|
psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
|
||||||
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
|
psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm6,xmm5
|
paddw xmm6, xmm5
|
||||||
paddw xmm4,xmm2
|
paddw xmm4, xmm2
|
||||||
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
|
paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
|
||||||
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
|
paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
|
||||||
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
|
paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
|
||||||
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
|
paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
|
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
|
||||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
|
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
|
||||||
|
|
||||||
movdqa xmm6,xmm5
|
movdqa xmm6, xmm5
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
punpcklwd xmm5,xmm1
|
punpcklwd xmm5, xmm1
|
||||||
punpckhwd xmm6,xmm1
|
punpckhwd xmm6, xmm1
|
||||||
pmaddwd xmm5,[rel PW_MF0344_F0285]
|
pmaddwd xmm5, [rel PW_MF0344_F0285]
|
||||||
pmaddwd xmm6,[rel PW_MF0344_F0285]
|
pmaddwd xmm6, [rel PW_MF0344_F0285]
|
||||||
punpcklwd xmm2,xmm3
|
punpcklwd xmm2, xmm3
|
||||||
punpckhwd xmm7,xmm3
|
punpckhwd xmm7, xmm3
|
||||||
pmaddwd xmm2,[rel PW_MF0344_F0285]
|
pmaddwd xmm2, [rel PW_MF0344_F0285]
|
||||||
pmaddwd xmm7,[rel PW_MF0344_F0285]
|
pmaddwd xmm7, [rel PW_MF0344_F0285]
|
||||||
|
|
||||||
paddd xmm5,[rel PD_ONEHALF]
|
paddd xmm5, [rel PD_ONEHALF]
|
||||||
paddd xmm6,[rel PD_ONEHALF]
|
paddd xmm6, [rel PD_ONEHALF]
|
||||||
psrad xmm5,SCALEBITS
|
psrad xmm5, SCALEBITS
|
||||||
psrad xmm6,SCALEBITS
|
psrad xmm6, SCALEBITS
|
||||||
paddd xmm2,[rel PD_ONEHALF]
|
paddd xmm2, [rel PD_ONEHALF]
|
||||||
paddd xmm7,[rel PD_ONEHALF]
|
paddd xmm7, [rel PD_ONEHALF]
|
||||||
psrad xmm2,SCALEBITS
|
psrad xmm2, SCALEBITS
|
||||||
psrad xmm7,SCALEBITS
|
psrad xmm7, SCALEBITS
|
||||||
|
|
||||||
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
||||||
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
||||||
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
||||||
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
||||||
|
|
||||||
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
|
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
|
||||||
|
|
||||||
mov al,2 ; Yctr
|
mov al, 2 ; Yctr
|
||||||
jmp short .Yloop_1st
|
jmp short .Yloop_1st
|
||||||
|
|
||||||
.Yloop_2nd:
|
.Yloop_2nd:
|
||||||
@@ -172,29 +172,29 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
.Yloop_1st:
|
.Yloop_1st:
|
||||||
movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
|
movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
|
||||||
|
|
||||||
pcmpeqw xmm6,xmm6
|
pcmpeqw xmm6, xmm6
|
||||||
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||||
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
|
pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
|
||||||
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
|
psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
|
||||||
|
|
||||||
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
|
movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
|
||||||
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
|
movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
|
||||||
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
|
movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
|
||||||
|
|
||||||
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
|
paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
|
||||||
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
|
paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
|
||||||
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
|
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
|
||||||
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
|
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
|
||||||
|
|
||||||
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
|
paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
|
||||||
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
|
paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
|
||||||
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
|
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
|
||||||
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
|
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
|
||||||
|
|
||||||
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
|
paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
|
||||||
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
|
paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
|
||||||
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
|
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
|
||||||
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
|
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
|
||||||
|
|
||||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||||
|
|
||||||
@@ -203,44 +203,44 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
movdqa xmmH,xmmA
|
movdqa xmmH, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
||||||
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
||||||
|
|
||||||
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
||||||
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
||||||
|
|
||||||
movdqa xmmC,xmmD
|
movdqa xmmC, xmmD
|
||||||
movdqa xmmB,xmmD
|
movdqa xmmB, xmmD
|
||||||
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
||||||
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
||||||
|
|
||||||
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
||||||
|
|
||||||
movdqa xmmF,xmmE
|
movdqa xmmF, xmmE
|
||||||
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
||||||
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
||||||
|
|
||||||
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
||||||
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
||||||
|
|
||||||
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
||||||
movdqa xmmB,xmmF
|
movdqa xmmB, xmmF
|
||||||
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
||||||
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
||||||
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
||||||
|
|
||||||
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||||
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -276,7 +276,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA, xmmF
|
||||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
@@ -284,7 +284,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||||
@@ -324,35 +324,35 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
%ifdef RGBX_FILLER_0XFF
|
%ifdef RGBX_FILLER_0XFF
|
||||||
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%else
|
%else
|
||||||
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%endif
|
%endif
|
||||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
||||||
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
||||||
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
||||||
movdqa xmmG,xmmB
|
movdqa xmmG, xmmB
|
||||||
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
||||||
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||||
movdqa xmmH,xmmC
|
movdqa xmmH, xmmC
|
||||||
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -389,15 +389,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA, xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD, xmmH
|
||||||
sub rcx, byte SIZEOF_XMMWORD/2
|
sub rcx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD/4
|
sub rcx, byte SIZEOF_XMMWORD/4
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||||
@@ -423,7 +423,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
@@ -449,8 +449,8 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_merged_upsample_sse2):
|
EXTN(jsimd_h2v2_merged_upsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
@@ -467,7 +467,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
|||||||
push rdx ; inptr2
|
push rdx ; inptr2
|
||||||
push rbx ; inptr1
|
push rbx ; inptr1
|
||||||
push rsi ; inptr00
|
push rsi ; inptr00
|
||||||
mov rbx,rsp
|
mov rbx, rsp
|
||||||
|
|
||||||
push rdi
|
push rdi
|
||||||
push rcx
|
push rcx
|
||||||
@@ -500,7 +500,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
|||||||
push rdx ; inptr2
|
push rdx ; inptr2
|
||||||
push rbx ; inptr1
|
push rbx ; inptr1
|
||||||
push rsi ; inptr00
|
push rsi ; inptr00
|
||||||
mov rbx,rsp
|
mov rbx, rsp
|
||||||
|
|
||||||
push rdi
|
push rdi
|
||||||
push rcx
|
push rcx
|
||||||
|
|||||||
@@ -44,11 +44,11 @@
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_merged_upsample_sse2):
|
EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic eax ; make a room for GOT address
|
pushpic eax ; make a room for GOT address
|
||||||
push ebx
|
push ebx
|
||||||
@@ -61,7 +61,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movpic POINTER [gotptr], ebx ; save GOT address
|
movpic POINTER [gotptr], ebx ; save GOT address
|
||||||
|
|
||||||
mov ecx, JDIMENSION [output_width(eax)] ; col
|
mov ecx, JDIMENSION [output_width(eax)] ; col
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
push ecx
|
push ecx
|
||||||
@@ -79,28 +79,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
|
|
||||||
pop ecx ; col
|
pop ecx ; col
|
||||||
|
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
|
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
|
||||||
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
|
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
|
||||||
|
|
||||||
pxor xmm1,xmm1 ; xmm1=(all 0's)
|
pxor xmm1, xmm1 ; xmm1=(all 0's)
|
||||||
pcmpeqw xmm3,xmm3
|
pcmpeqw xmm3, xmm3
|
||||||
psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
|
punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
|
||||||
punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
|
punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
|
punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
|
||||||
punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
|
punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
|
||||||
|
|
||||||
paddw xmm6,xmm3
|
paddw xmm6, xmm3
|
||||||
paddw xmm4,xmm3
|
paddw xmm4, xmm3
|
||||||
paddw xmm7,xmm3
|
paddw xmm7, xmm3
|
||||||
paddw xmm0,xmm3
|
paddw xmm0, xmm3
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; R = Y + 1.40200 * Cr
|
; R = Y + 1.40200 * Cr
|
||||||
@@ -112,102 +112,102 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||||
|
|
||||||
movdqa xmm5,xmm6 ; xmm5=CbH
|
movdqa xmm5, xmm6 ; xmm5=CbH
|
||||||
movdqa xmm2,xmm4 ; xmm2=CbL
|
movdqa xmm2, xmm4 ; xmm2=CbL
|
||||||
paddw xmm6,xmm6 ; xmm6=2*CbH
|
paddw xmm6, xmm6 ; xmm6=2*CbH
|
||||||
paddw xmm4,xmm4 ; xmm4=2*CbL
|
paddw xmm4, xmm4 ; xmm4=2*CbL
|
||||||
movdqa xmm1,xmm7 ; xmm1=CrH
|
movdqa xmm1, xmm7 ; xmm1=CrH
|
||||||
movdqa xmm3,xmm0 ; xmm3=CrL
|
movdqa xmm3, xmm0 ; xmm3=CrL
|
||||||
paddw xmm7,xmm7 ; xmm7=2*CrH
|
paddw xmm7, xmm7 ; xmm7=2*CrH
|
||||||
paddw xmm0,xmm0 ; xmm0=2*CrL
|
paddw xmm0, xmm0 ; xmm0=2*CrL
|
||||||
|
|
||||||
pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
|
pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
|
||||||
pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
|
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
|
||||||
pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
|
pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
|
||||||
pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
|
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm6,[GOTOFF(eax,PW_ONE)]
|
paddw xmm6, [GOTOFF(eax,PW_ONE)]
|
||||||
paddw xmm4,[GOTOFF(eax,PW_ONE)]
|
paddw xmm4, [GOTOFF(eax,PW_ONE)]
|
||||||
psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
|
psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
|
||||||
psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
|
psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
|
||||||
paddw xmm7,[GOTOFF(eax,PW_ONE)]
|
paddw xmm7, [GOTOFF(eax,PW_ONE)]
|
||||||
paddw xmm0,[GOTOFF(eax,PW_ONE)]
|
paddw xmm0, [GOTOFF(eax,PW_ONE)]
|
||||||
psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
|
psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
|
||||||
psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
|
psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
|
||||||
|
|
||||||
paddw xmm6,xmm5
|
paddw xmm6, xmm5
|
||||||
paddw xmm4,xmm2
|
paddw xmm4, xmm2
|
||||||
paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
|
paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
|
||||||
paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
|
paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
|
||||||
paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
|
paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
|
||||||
paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
|
paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
|
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
|
||||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
|
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
|
||||||
|
|
||||||
movdqa xmm6,xmm5
|
movdqa xmm6, xmm5
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
punpcklwd xmm5,xmm1
|
punpcklwd xmm5, xmm1
|
||||||
punpckhwd xmm6,xmm1
|
punpckhwd xmm6, xmm1
|
||||||
pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
punpcklwd xmm2,xmm3
|
punpcklwd xmm2, xmm3
|
||||||
punpckhwd xmm7,xmm3
|
punpckhwd xmm7, xmm3
|
||||||
pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
|
pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||||
|
|
||||||
paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
paddd xmm6,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
psrad xmm5,SCALEBITS
|
psrad xmm5, SCALEBITS
|
||||||
psrad xmm6,SCALEBITS
|
psrad xmm6, SCALEBITS
|
||||||
paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
paddd xmm7,[GOTOFF(eax,PD_ONEHALF)]
|
paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
|
||||||
psrad xmm2,SCALEBITS
|
psrad xmm2, SCALEBITS
|
||||||
psrad xmm7,SCALEBITS
|
psrad xmm7, SCALEBITS
|
||||||
|
|
||||||
packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
||||||
packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
||||||
psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
||||||
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
||||||
|
|
||||||
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
|
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
|
||||||
|
|
||||||
mov al,2 ; Yctr
|
mov al, 2 ; Yctr
|
||||||
jmp short .Yloop_1st
|
jmp short .Yloop_1st
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.Yloop_2nd:
|
.Yloop_2nd:
|
||||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
|
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
|
||||||
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
|
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
|
||||||
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
|
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.Yloop_1st:
|
.Yloop_1st:
|
||||||
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
|
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
|
||||||
|
|
||||||
pcmpeqw xmm6,xmm6
|
pcmpeqw xmm6, xmm6
|
||||||
psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||||
pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
|
pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
|
||||||
psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
|
psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
|
||||||
|
|
||||||
movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
|
movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
|
||||||
movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
|
movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
|
||||||
movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
|
movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
|
||||||
|
|
||||||
paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
|
paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
|
||||||
paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
|
paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
|
||||||
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
|
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
|
||||||
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
|
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
|
||||||
|
|
||||||
paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
|
paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
|
||||||
paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
|
paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
|
||||||
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
|
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
|
||||||
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
|
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
|
||||||
|
|
||||||
paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
|
paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
|
||||||
paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
|
paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
|
||||||
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
|
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
|
||||||
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
|
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
|
||||||
|
|
||||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||||
|
|
||||||
@@ -216,44 +216,44 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
||||||
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
||||||
|
|
||||||
movdqa xmmG,xmmA
|
movdqa xmmG, xmmA
|
||||||
movdqa xmmH,xmmA
|
movdqa xmmH, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
||||||
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
||||||
|
|
||||||
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
||||||
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
||||||
|
|
||||||
movdqa xmmC,xmmD
|
movdqa xmmC, xmmD
|
||||||
movdqa xmmB,xmmD
|
movdqa xmmB, xmmD
|
||||||
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
||||||
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
||||||
|
|
||||||
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
||||||
|
|
||||||
movdqa xmmF,xmmE
|
movdqa xmmF, xmmE
|
||||||
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
||||||
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
||||||
|
|
||||||
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
||||||
movdqa xmmB,xmmE
|
movdqa xmmB, xmmE
|
||||||
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
||||||
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
||||||
|
|
||||||
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
||||||
movdqa xmmB,xmmF
|
movdqa xmmB, xmmF
|
||||||
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
||||||
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
||||||
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
||||||
|
|
||||||
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||||
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||||
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -281,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||||
jmp near .columnloop
|
jmp near .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.column_st32:
|
.column_st32:
|
||||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||||
@@ -290,7 +290,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA, xmmF
|
||||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
@@ -298,7 +298,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||||
@@ -338,35 +338,35 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
|
|
||||||
%ifdef RGBX_FILLER_0XFF
|
%ifdef RGBX_FILLER_0XFF
|
||||||
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%else
|
%else
|
||||||
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||||
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||||
%endif
|
%endif
|
||||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||||
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
||||||
|
|
||||||
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||||
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
||||||
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
||||||
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
||||||
|
|
||||||
movdqa xmmC,xmmA
|
movdqa xmmC, xmmA
|
||||||
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
||||||
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
||||||
movdqa xmmG,xmmB
|
movdqa xmmG, xmmB
|
||||||
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
||||||
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
||||||
|
|
||||||
movdqa xmmD,xmmA
|
movdqa xmmD, xmmA
|
||||||
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||||
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||||
movdqa xmmH,xmmC
|
movdqa xmmH, xmmC
|
||||||
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||||
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||||
|
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st32
|
jb short .column_st32
|
||||||
@@ -396,7 +396,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||||
jmp near .columnloop
|
jmp near .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.column_st32:
|
.column_st32:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||||
@@ -404,15 +404,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA, xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD, xmmH
|
||||||
sub ecx, byte SIZEOF_XMMWORD/2
|
sub ecx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA, xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD/4
|
sub ecx, byte SIZEOF_XMMWORD/4
|
||||||
.column_st15:
|
.column_st15:
|
||||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||||
@@ -441,7 +441,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
pop ebx
|
pop ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
@@ -467,7 +467,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_merged_upsample_sse2):
|
EXTN(jsimd_h2v2_merged_upsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
push ebx
|
push ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -487,7 +487,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
|||||||
push edx ; inptr2
|
push edx ; inptr2
|
||||||
push ebx ; inptr1
|
push ebx ; inptr1
|
||||||
push esi ; inptr00
|
push esi ; inptr00
|
||||||
mov ebx,esp
|
mov ebx, esp
|
||||||
|
|
||||||
push edi ; output_buf (outptr0)
|
push edi ; output_buf (outptr0)
|
||||||
push ecx ; in_row_group_ctr
|
push ecx ; in_row_group_ctr
|
||||||
|
|||||||
@@ -62,16 +62,16 @@ PW_EIGHT times 8 dw 8
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
mov eax, r11d ; colctr
|
mov eax, r11d ; colctr
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rcx, r10 ; rowctr
|
mov rcx, r10 ; rowctr
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rsi, r12 ; input_data
|
mov rsi, r12 ; input_data
|
||||||
@@ -90,9 +90,9 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
|
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
|
||||||
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||||
.skip:
|
.skip:
|
||||||
pxor xmm0,xmm0 ; xmm0=(all 0's)
|
pxor xmm0, xmm0 ; xmm0=(all 0's)
|
||||||
pcmpeqb xmm7,xmm7
|
pcmpeqb xmm7, xmm7
|
||||||
psrldq xmm7,(SIZEOF_XMMWORD-1)
|
psrldq xmm7, (SIZEOF_XMMWORD-1)
|
||||||
pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
add rax, byte SIZEOF_XMMWORD-1
|
add rax, byte SIZEOF_XMMWORD-1
|
||||||
@@ -101,58 +101,58 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
ja short .columnloop
|
ja short .columnloop
|
||||||
|
|
||||||
.columnloop_last:
|
.columnloop_last:
|
||||||
pcmpeqb xmm6,xmm6
|
pcmpeqb xmm6, xmm6
|
||||||
pslldq xmm6,(SIZEOF_XMMWORD-1)
|
pslldq xmm6, (SIZEOF_XMMWORD-1)
|
||||||
pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
jmp short .upsample
|
jmp short .upsample
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
pslldq xmm6,(SIZEOF_XMMWORD-1)
|
pslldq xmm6, (SIZEOF_XMMWORD-1)
|
||||||
|
|
||||||
.upsample:
|
.upsample:
|
||||||
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
|
movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
|
||||||
pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
|
pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
|
||||||
psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
|
psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
|
||||||
|
|
||||||
por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
|
por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
|
||||||
por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
|
por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
|
||||||
|
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
|
psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
|
||||||
|
|
||||||
movdqa xmm4,xmm1
|
movdqa xmm4, xmm1
|
||||||
punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm5,xmm2
|
movdqa xmm5, xmm2
|
||||||
punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
|
punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
|
||||||
punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
|
punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||||
movdqa xmm6,xmm3
|
movdqa xmm6, xmm3
|
||||||
punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
|
punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
|
||||||
punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
|
punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
|
||||||
|
|
||||||
pmullw xmm1,[rel PW_THREE]
|
pmullw xmm1, [rel PW_THREE]
|
||||||
pmullw xmm4,[rel PW_THREE]
|
pmullw xmm4, [rel PW_THREE]
|
||||||
paddw xmm2,[rel PW_ONE]
|
paddw xmm2, [rel PW_ONE]
|
||||||
paddw xmm5,[rel PW_ONE]
|
paddw xmm5, [rel PW_ONE]
|
||||||
paddw xmm3,[rel PW_TWO]
|
paddw xmm3, [rel PW_TWO]
|
||||||
paddw xmm6,[rel PW_TWO]
|
paddw xmm6, [rel PW_TWO]
|
||||||
|
|
||||||
paddw xmm2,xmm1
|
paddw xmm2, xmm1
|
||||||
paddw xmm5,xmm4
|
paddw xmm5, xmm4
|
||||||
psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
|
psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
|
||||||
psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
|
psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
|
||||||
paddw xmm3,xmm1
|
paddw xmm3, xmm1
|
||||||
paddw xmm6,xmm4
|
paddw xmm6, xmm4
|
||||||
psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
|
psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
|
||||||
psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
|
psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
|
||||||
|
|
||||||
psllw xmm3,BYTE_BIT
|
psllw xmm3, BYTE_BIT
|
||||||
psllw xmm6,BYTE_BIT
|
psllw xmm6, BYTE_BIT
|
||||||
por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
|
por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
|
||||||
por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
|
por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
|
||||||
|
|
||||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
|
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
|
||||||
@@ -162,7 +162,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
cmp rax, byte SIZEOF_XMMWORD
|
cmp rax, byte SIZEOF_XMMWORD
|
||||||
ja near .columnloop
|
ja near .columnloop
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz near .columnloop_last
|
jnz near .columnloop_last
|
||||||
|
|
||||||
pop rsi
|
pop rsi
|
||||||
@@ -204,21 +204,21 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov eax, r11d ; colctr
|
mov eax, r11d ; colctr
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rcx, r10 ; rowctr
|
mov rcx, r10 ; rowctr
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rsi, r12 ; input_data
|
mov rsi, r12 ; input_data
|
||||||
@@ -253,35 +253,35 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
|
movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
|
||||||
movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
|
movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
|
||||||
|
|
||||||
pxor xmm3,xmm3 ; xmm3=(all 0's)
|
pxor xmm3, xmm3 ; xmm3=(all 0's)
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
pmullw xmm0,[rel PW_THREE]
|
pmullw xmm0, [rel PW_THREE]
|
||||||
pmullw xmm4,[rel PW_THREE]
|
pmullw xmm4, [rel PW_THREE]
|
||||||
|
|
||||||
pcmpeqb xmm7,xmm7
|
pcmpeqb xmm7, xmm7
|
||||||
psrldq xmm7,(SIZEOF_XMMWORD-2)
|
psrldq xmm7, (SIZEOF_XMMWORD-2)
|
||||||
|
|
||||||
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||||
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||||
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
|
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
|
||||||
|
|
||||||
pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
|
pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
|
||||||
pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
|
pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm1
|
movdqa XMMWORD [wk(0)], xmm1
|
||||||
movdqa XMMWORD [wk(1)], xmm2
|
movdqa XMMWORD [wk(1)], xmm2
|
||||||
@@ -294,9 +294,9 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
.columnloop_last:
|
.columnloop_last:
|
||||||
; -- process the last column block
|
; -- process the last column block
|
||||||
|
|
||||||
pcmpeqb xmm1,xmm1
|
pcmpeqb xmm1, xmm1
|
||||||
pslldq xmm1,(SIZEOF_XMMWORD-2)
|
pslldq xmm1, (SIZEOF_XMMWORD-2)
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
|
|
||||||
pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
||||||
pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
||||||
@@ -313,32 +313,32 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
|
movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
|
||||||
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
|
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
|
||||||
|
|
||||||
pxor xmm3,xmm3 ; xmm3=(all 0's)
|
pxor xmm3, xmm3 ; xmm3=(all 0's)
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
pmullw xmm0,[rel PW_THREE]
|
pmullw xmm0, [rel PW_THREE]
|
||||||
pmullw xmm4,[rel PW_THREE]
|
pmullw xmm4, [rel PW_THREE]
|
||||||
|
|
||||||
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||||
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||||
movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||||
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
|
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
|
||||||
|
|
||||||
pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
|
pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
|
||||||
pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
|
pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(2)], xmm1
|
movdqa XMMWORD [wk(2)], xmm1
|
||||||
movdqa XMMWORD [wk(3)], xmm2
|
movdqa XMMWORD [wk(3)], xmm2
|
||||||
@@ -349,50 +349,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
|
movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
|
||||||
movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
|
movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
|
||||||
psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
|
psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
|
||||||
pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
|
pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
movdqa xmm6,xmm3
|
movdqa xmm6, xmm3
|
||||||
psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
|
psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
|
||||||
pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
|
pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
|
por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
|
||||||
por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
|
por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
movdqa xmm1,xmm7
|
movdqa xmm1, xmm7
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||||
psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
|
psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
|
psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
|
||||||
|
|
||||||
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||||
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
|
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4
|
movdqa XMMWORD [wk(0)], xmm4
|
||||||
|
|
||||||
pmullw xmm7,[rel PW_THREE]
|
pmullw xmm7, [rel PW_THREE]
|
||||||
pmullw xmm3,[rel PW_THREE]
|
pmullw xmm3, [rel PW_THREE]
|
||||||
paddw xmm1,[rel PW_EIGHT]
|
paddw xmm1, [rel PW_EIGHT]
|
||||||
paddw xmm5,[rel PW_EIGHT]
|
paddw xmm5, [rel PW_EIGHT]
|
||||||
paddw xmm0,[rel PW_SEVEN]
|
paddw xmm0, [rel PW_SEVEN]
|
||||||
paddw xmm2,[rel PW_SEVEN]
|
paddw xmm2, [rel PW_SEVEN]
|
||||||
|
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
paddw xmm5,xmm3
|
paddw xmm5, xmm3
|
||||||
psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
|
psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
|
||||||
psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
|
psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm2,xmm3
|
paddw xmm2, xmm3
|
||||||
psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
|
psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
|
||||||
psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
|
psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
|
||||||
|
|
||||||
psllw xmm0,BYTE_BIT
|
psllw xmm0, BYTE_BIT
|
||||||
psllw xmm2,BYTE_BIT
|
psllw xmm2, BYTE_BIT
|
||||||
por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
|
por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
|
||||||
por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
|
por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
|
||||||
|
|
||||||
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
|
||||||
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
|
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
|
||||||
@@ -402,50 +402,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
|
movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
|
movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
|
||||||
movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
|
movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
|
||||||
psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
|
psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
|
||||||
pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
|
pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
movdqa xmm2,xmm4
|
movdqa xmm2, xmm4
|
||||||
psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
|
psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
|
||||||
pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
|
pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
|
por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
|
||||||
por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
|
por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||||
psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
|
psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
|
||||||
movdqa xmm3,xmm4
|
movdqa xmm3, xmm4
|
||||||
psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
|
psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
|
||||||
|
|
||||||
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||||
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
|
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(1)], xmm3
|
movdqa XMMWORD [wk(1)], xmm3
|
||||||
|
|
||||||
pmullw xmm6,[rel PW_THREE]
|
pmullw xmm6, [rel PW_THREE]
|
||||||
pmullw xmm4,[rel PW_THREE]
|
pmullw xmm4, [rel PW_THREE]
|
||||||
paddw xmm1,[rel PW_EIGHT]
|
paddw xmm1, [rel PW_EIGHT]
|
||||||
paddw xmm0,[rel PW_EIGHT]
|
paddw xmm0, [rel PW_EIGHT]
|
||||||
paddw xmm7,[rel PW_SEVEN]
|
paddw xmm7, [rel PW_SEVEN]
|
||||||
paddw xmm5,[rel PW_SEVEN]
|
paddw xmm5, [rel PW_SEVEN]
|
||||||
|
|
||||||
paddw xmm1,xmm6
|
paddw xmm1, xmm6
|
||||||
paddw xmm0,xmm4
|
paddw xmm0, xmm4
|
||||||
psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
|
psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
|
||||||
psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
|
psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
|
||||||
paddw xmm7,xmm6
|
paddw xmm7, xmm6
|
||||||
paddw xmm5,xmm4
|
paddw xmm5, xmm4
|
||||||
psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
|
psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
|
||||||
psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
|
psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
|
||||||
|
|
||||||
psllw xmm7,BYTE_BIT
|
psllw xmm7, BYTE_BIT
|
||||||
psllw xmm5,BYTE_BIT
|
psllw xmm5, BYTE_BIT
|
||||||
por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
|
por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
|
||||||
por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
|
por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
|
||||||
|
|
||||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
|
||||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
|
||||||
@@ -458,7 +458,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
|
||||||
cmp rax, byte SIZEOF_XMMWORD
|
cmp rax, byte SIZEOF_XMMWORD
|
||||||
ja near .columnloop
|
ja near .columnloop
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jnz near .columnloop_last
|
jnz near .columnloop_last
|
||||||
|
|
||||||
pop rsi
|
pop rsi
|
||||||
@@ -474,7 +474,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
@@ -501,8 +501,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_upsample_sse2):
|
EXTN(jsimd_h2v1_upsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
mov edx, r11d
|
mov edx, r11d
|
||||||
@@ -511,7 +511,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rcx, r10 ; rowctr
|
mov rcx, r10 ; rowctr
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz short .return
|
jz short .return
|
||||||
|
|
||||||
mov rsi, r12 ; input_data
|
mov rsi, r12 ; input_data
|
||||||
@@ -523,14 +523,14 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
|
|
||||||
mov rsi, JSAMPROW [rsi] ; inptr
|
mov rsi, JSAMPROW [rsi] ; inptr
|
||||||
mov rdi, JSAMPROW [rdi] ; outptr
|
mov rdi, JSAMPROW [rdi] ; outptr
|
||||||
mov rax,rdx ; colctr
|
mov rax, rdx ; colctr
|
||||||
.columnloop:
|
.columnloop:
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklbw xmm0,xmm0
|
punpcklbw xmm0, xmm0
|
||||||
punpckhbw xmm1,xmm1
|
punpckhbw xmm1, xmm1
|
||||||
|
|
||||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
|
||||||
@@ -540,9 +540,9 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
|
|
||||||
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
punpcklbw xmm2,xmm2
|
punpcklbw xmm2, xmm2
|
||||||
punpckhbw xmm3,xmm3
|
punpckhbw xmm3, xmm3
|
||||||
|
|
||||||
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
|
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
|
||||||
@@ -574,7 +574,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
; It's still a box filter.
|
; It's still a box filter.
|
||||||
;
|
;
|
||||||
; GLOBAL(void)
|
; GLOBAL(void)
|
||||||
; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
|
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
|
||||||
; JDIMENSION output_width,
|
; JDIMENSION output_width,
|
||||||
; JSAMPARRAY input_data,
|
; JSAMPARRAY input_data,
|
||||||
; JSAMPARRAY *output_data_ptr);
|
; JSAMPARRAY *output_data_ptr);
|
||||||
@@ -590,8 +590,8 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_upsample_sse2):
|
EXTN(jsimd_h2v2_upsample_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
@@ -601,7 +601,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rcx, r10 ; rowctr
|
mov rcx, r10 ; rowctr
|
||||||
test rcx,rcx
|
test rcx, rcx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov rsi, r12 ; input_data
|
mov rsi, r12 ; input_data
|
||||||
@@ -614,14 +614,14 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
mov rsi, JSAMPROW [rsi] ; inptr
|
mov rsi, JSAMPROW [rsi] ; inptr
|
||||||
mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
|
mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||||
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
|
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||||
mov rax,rdx ; colctr
|
mov rax, rdx ; colctr
|
||||||
.columnloop:
|
.columnloop:
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklbw xmm0,xmm0
|
punpcklbw xmm0, xmm0
|
||||||
punpckhbw xmm1,xmm1
|
punpckhbw xmm1, xmm1
|
||||||
|
|
||||||
movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
|
||||||
movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
|
||||||
@@ -633,9 +633,9 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
|
|
||||||
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
punpcklbw xmm2,xmm2
|
punpcklbw xmm2, xmm2
|
||||||
punpckhbw xmm3,xmm3
|
punpckhbw xmm3, xmm3
|
||||||
|
|
||||||
movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
|
movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ PW_EIGHT times 8 dw 8
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -71,17 +71,17 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
get_GOT ebx ; get GOT address
|
get_GOT ebx ; get GOT address
|
||||||
|
|
||||||
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
|
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
mov edi, POINTER [output_data_ptr(ebp)]
|
mov edi, POINTER [output_data_ptr(ebp)]
|
||||||
mov edi, JSAMPARRAY [edi] ; output_data
|
mov edi, JSAMPARRAY [edi] ; output_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push eax ; colctr
|
push eax ; colctr
|
||||||
push edi
|
push edi
|
||||||
@@ -95,71 +95,71 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
|
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
|
||||||
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||||
.skip:
|
.skip:
|
||||||
pxor xmm0,xmm0 ; xmm0=(all 0's)
|
pxor xmm0, xmm0 ; xmm0=(all 0's)
|
||||||
pcmpeqb xmm7,xmm7
|
pcmpeqb xmm7, xmm7
|
||||||
psrldq xmm7,(SIZEOF_XMMWORD-1)
|
psrldq xmm7, (SIZEOF_XMMWORD-1)
|
||||||
pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
add eax, byte SIZEOF_XMMWORD-1
|
add eax, byte SIZEOF_XMMWORD-1
|
||||||
and eax, byte -SIZEOF_XMMWORD
|
and eax, byte -SIZEOF_XMMWORD
|
||||||
cmp eax, byte SIZEOF_XMMWORD
|
cmp eax, byte SIZEOF_XMMWORD
|
||||||
ja short .columnloop
|
ja short .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop_last:
|
.columnloop_last:
|
||||||
pcmpeqb xmm6,xmm6
|
pcmpeqb xmm6, xmm6
|
||||||
pslldq xmm6,(SIZEOF_XMMWORD-1)
|
pslldq xmm6, (SIZEOF_XMMWORD-1)
|
||||||
pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
jmp short .upsample
|
jmp short .upsample
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
pslldq xmm6,(SIZEOF_XMMWORD-1)
|
pslldq xmm6, (SIZEOF_XMMWORD-1)
|
||||||
|
|
||||||
.upsample:
|
.upsample:
|
||||||
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
|
movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
|
||||||
pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
|
pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
|
||||||
psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
|
psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
|
||||||
|
|
||||||
por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
|
por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
|
||||||
por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
|
por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
|
||||||
|
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
|
psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
|
||||||
|
|
||||||
movdqa xmm4,xmm1
|
movdqa xmm4, xmm1
|
||||||
punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm5,xmm2
|
movdqa xmm5, xmm2
|
||||||
punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
|
punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
|
||||||
punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
|
punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||||
movdqa xmm6,xmm3
|
movdqa xmm6, xmm3
|
||||||
punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
|
punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
|
||||||
punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
|
punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
|
||||||
|
|
||||||
pmullw xmm1,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm1, [GOTOFF(ebx,PW_THREE)]
|
||||||
pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
|
||||||
paddw xmm2,[GOTOFF(ebx,PW_ONE)]
|
paddw xmm2, [GOTOFF(ebx,PW_ONE)]
|
||||||
paddw xmm5,[GOTOFF(ebx,PW_ONE)]
|
paddw xmm5, [GOTOFF(ebx,PW_ONE)]
|
||||||
paddw xmm3,[GOTOFF(ebx,PW_TWO)]
|
paddw xmm3, [GOTOFF(ebx,PW_TWO)]
|
||||||
paddw xmm6,[GOTOFF(ebx,PW_TWO)]
|
paddw xmm6, [GOTOFF(ebx,PW_TWO)]
|
||||||
|
|
||||||
paddw xmm2,xmm1
|
paddw xmm2, xmm1
|
||||||
paddw xmm5,xmm4
|
paddw xmm5, xmm4
|
||||||
psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
|
psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
|
||||||
psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
|
psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
|
||||||
paddw xmm3,xmm1
|
paddw xmm3, xmm1
|
||||||
paddw xmm6,xmm4
|
paddw xmm6, xmm4
|
||||||
psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
|
psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
|
||||||
psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
|
psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
|
||||||
|
|
||||||
psllw xmm3,BYTE_BIT
|
psllw xmm3, BYTE_BIT
|
||||||
psllw xmm6,BYTE_BIT
|
psllw xmm6, BYTE_BIT
|
||||||
por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
|
por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
|
||||||
por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
|
por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
|
||||||
|
|
||||||
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
|
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
|
||||||
@@ -169,7 +169,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
cmp eax, byte SIZEOF_XMMWORD
|
cmp eax, byte SIZEOF_XMMWORD
|
||||||
ja near .columnloop
|
ja near .columnloop
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz near .columnloop_last
|
jnz near .columnloop_last
|
||||||
|
|
||||||
pop esi
|
pop esi
|
||||||
@@ -217,11 +217,11 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic eax ; make a room for GOT address
|
pushpic eax ; make a room for GOT address
|
||||||
push ebx
|
push ebx
|
||||||
@@ -233,19 +233,19 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
get_GOT ebx ; get GOT address
|
get_GOT ebx ; get GOT address
|
||||||
movpic POINTER [gotptr], ebx ; save GOT address
|
movpic POINTER [gotptr], ebx ; save GOT address
|
||||||
|
|
||||||
mov edx,eax ; edx = original ebp
|
mov edx, eax ; edx = original ebp
|
||||||
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
|
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov ecx, INT [max_v_samp(edx)] ; rowctr
|
mov ecx, INT [max_v_samp(edx)] ; rowctr
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
|
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
|
||||||
mov edi, POINTER [output_data_ptr(edx)]
|
mov edi, POINTER [output_data_ptr(edx)]
|
||||||
mov edi, JSAMPARRAY [edi] ; output_data
|
mov edi, JSAMPARRAY [edi] ; output_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push eax ; colctr
|
push eax ; colctr
|
||||||
push ecx
|
push ecx
|
||||||
@@ -278,35 +278,35 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
pushpic ebx
|
pushpic ebx
|
||||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||||
|
|
||||||
pxor xmm3,xmm3 ; xmm3=(all 0's)
|
pxor xmm3, xmm3 ; xmm3=(all 0's)
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
|
||||||
pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
|
||||||
|
|
||||||
pcmpeqb xmm7,xmm7
|
pcmpeqb xmm7, xmm7
|
||||||
psrldq xmm7,(SIZEOF_XMMWORD-2)
|
psrldq xmm7, (SIZEOF_XMMWORD-2)
|
||||||
|
|
||||||
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||||
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||||
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||||
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
|
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
|
||||||
|
|
||||||
pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
|
pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
|
||||||
pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
|
pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm1
|
movdqa XMMWORD [wk(0)], xmm1
|
||||||
movdqa XMMWORD [wk(1)], xmm2
|
movdqa XMMWORD [wk(1)], xmm2
|
||||||
@@ -317,7 +317,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
and eax, byte -SIZEOF_XMMWORD
|
and eax, byte -SIZEOF_XMMWORD
|
||||||
cmp eax, byte SIZEOF_XMMWORD
|
cmp eax, byte SIZEOF_XMMWORD
|
||||||
ja short .columnloop
|
ja short .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop_last:
|
.columnloop_last:
|
||||||
; -- process the last column block
|
; -- process the last column block
|
||||||
@@ -325,9 +325,9 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
pushpic ebx
|
pushpic ebx
|
||||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||||
|
|
||||||
pcmpeqb xmm1,xmm1
|
pcmpeqb xmm1, xmm1
|
||||||
pslldq xmm1,(SIZEOF_XMMWORD-2)
|
pslldq xmm1, (SIZEOF_XMMWORD-2)
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
|
|
||||||
pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
|
pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
|
||||||
pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
|
pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
|
||||||
@@ -336,7 +336,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
|
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
|
||||||
|
|
||||||
jmp near .upsample
|
jmp near .upsample
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.columnloop:
|
.columnloop:
|
||||||
; -- process the next column block
|
; -- process the next column block
|
||||||
@@ -348,32 +348,32 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
pushpic ebx
|
pushpic ebx
|
||||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||||
|
|
||||||
pxor xmm3,xmm3 ; xmm3=(all 0's)
|
pxor xmm3, xmm3 ; xmm3=(all 0's)
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||||
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
|
||||||
pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
|
||||||
|
|
||||||
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||||
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||||
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||||
|
|
||||||
movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||||
movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||||
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
|
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
|
||||||
|
|
||||||
pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
|
pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
|
||||||
pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
|
pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(2)], xmm1
|
movdqa XMMWORD [wk(2)], xmm1
|
||||||
movdqa XMMWORD [wk(3)], xmm2
|
movdqa XMMWORD [wk(3)], xmm2
|
||||||
@@ -384,50 +384,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
|
movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
|
movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
|
movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
|
||||||
movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
|
movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
|
||||||
psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
|
psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
|
||||||
pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
|
pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
movdqa xmm6,xmm3
|
movdqa xmm6, xmm3
|
||||||
psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
|
psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
|
||||||
pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
|
pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
|
por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
|
||||||
por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
|
por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
movdqa xmm1,xmm7
|
movdqa xmm1, xmm7
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||||
psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
|
psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
|
psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
|
||||||
|
|
||||||
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||||
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
|
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4
|
movdqa XMMWORD [wk(0)], xmm4
|
||||||
|
|
||||||
pmullw xmm7,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
|
||||||
pmullw xmm3,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
|
||||||
paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
|
paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
|
||||||
paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
|
paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
|
||||||
paddw xmm0,[GOTOFF(ebx,PW_SEVEN)]
|
paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
|
||||||
paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
|
paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
|
||||||
|
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
paddw xmm5,xmm3
|
paddw xmm5, xmm3
|
||||||
psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
|
psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
|
||||||
psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
|
psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm2,xmm3
|
paddw xmm2, xmm3
|
||||||
psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
|
psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
|
||||||
psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
|
psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
|
||||||
|
|
||||||
psllw xmm0,BYTE_BIT
|
psllw xmm0, BYTE_BIT
|
||||||
psllw xmm2,BYTE_BIT
|
psllw xmm2, BYTE_BIT
|
||||||
por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
|
por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
|
||||||
por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
|
por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
|
||||||
|
|
||||||
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
|
||||||
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
|
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
|
||||||
@@ -437,50 +437,50 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
|
movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
|
||||||
movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
|
movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
|
movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
|
||||||
movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
|
movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
|
||||||
psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
|
psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
|
||||||
pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
|
pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
movdqa xmm2,xmm4
|
movdqa xmm2, xmm4
|
||||||
psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
|
psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
|
||||||
pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
|
pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
|
por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
|
||||||
por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
|
por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
|
||||||
|
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||||
psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
|
psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
|
||||||
movdqa xmm3,xmm4
|
movdqa xmm3, xmm4
|
||||||
psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
|
psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
|
||||||
|
|
||||||
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||||
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
|
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(1)], xmm3
|
movdqa XMMWORD [wk(1)], xmm3
|
||||||
|
|
||||||
pmullw xmm6,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
|
||||||
pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
|
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
|
||||||
paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
|
paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
|
||||||
paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
|
paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
|
||||||
paddw xmm7,[GOTOFF(ebx,PW_SEVEN)]
|
paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
|
||||||
paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
|
paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
|
||||||
|
|
||||||
paddw xmm1,xmm6
|
paddw xmm1, xmm6
|
||||||
paddw xmm0,xmm4
|
paddw xmm0, xmm4
|
||||||
psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
|
psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
|
||||||
psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
|
psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
|
||||||
paddw xmm7,xmm6
|
paddw xmm7, xmm6
|
||||||
paddw xmm5,xmm4
|
paddw xmm5, xmm4
|
||||||
psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
|
psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
|
||||||
psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
|
psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
|
||||||
|
|
||||||
psllw xmm7,BYTE_BIT
|
psllw xmm7, BYTE_BIT
|
||||||
psllw xmm5,BYTE_BIT
|
psllw xmm5, BYTE_BIT
|
||||||
por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
|
por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
|
||||||
por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
|
por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
|
||||||
|
|
||||||
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
|
||||||
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
|
||||||
@@ -495,7 +495,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr1
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr1
|
||||||
cmp eax, byte SIZEOF_XMMWORD
|
cmp eax, byte SIZEOF_XMMWORD
|
||||||
ja near .columnloop
|
ja near .columnloop
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz near .columnloop_last
|
jnz near .columnloop_last
|
||||||
|
|
||||||
pop esi
|
pop esi
|
||||||
@@ -514,7 +514,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
pop ebx
|
pop ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
@@ -541,7 +541,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v1_upsample_sse2):
|
EXTN(jsimd_h2v1_upsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
; push ebx ; unused
|
; push ebx ; unused
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -554,28 +554,28 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
jz short .return
|
jz short .return
|
||||||
|
|
||||||
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz short .return
|
jz short .return
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
mov edi, POINTER [output_data_ptr(ebp)]
|
mov edi, POINTER [output_data_ptr(ebp)]
|
||||||
mov edi, JSAMPARRAY [edi] ; output_data
|
mov edi, JSAMPARRAY [edi] ; output_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push edi
|
push edi
|
||||||
push esi
|
push esi
|
||||||
|
|
||||||
mov esi, JSAMPROW [esi] ; inptr
|
mov esi, JSAMPROW [esi] ; inptr
|
||||||
mov edi, JSAMPROW [edi] ; outptr
|
mov edi, JSAMPROW [edi] ; outptr
|
||||||
mov eax,edx ; colctr
|
mov eax, edx ; colctr
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.columnloop:
|
.columnloop:
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklbw xmm0,xmm0
|
punpcklbw xmm0, xmm0
|
||||||
punpckhbw xmm1,xmm1
|
punpckhbw xmm1, xmm1
|
||||||
|
|
||||||
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
|
||||||
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
|
||||||
@@ -585,9 +585,9 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
|
|
||||||
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
punpcklbw xmm2,xmm2
|
punpcklbw xmm2, xmm2
|
||||||
punpckhbw xmm3,xmm3
|
punpckhbw xmm3, xmm3
|
||||||
|
|
||||||
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
|
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
|
||||||
@@ -598,7 +598,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
add esi, byte 2*SIZEOF_XMMWORD ; inptr
|
add esi, byte 2*SIZEOF_XMMWORD ; inptr
|
||||||
add edi, byte 4*SIZEOF_XMMWORD ; outptr
|
add edi, byte 4*SIZEOF_XMMWORD ; outptr
|
||||||
jmp short .columnloop
|
jmp short .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.nextrow:
|
.nextrow:
|
||||||
pop esi
|
pop esi
|
||||||
@@ -640,7 +640,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_h2v2_upsample_sse2):
|
EXTN(jsimd_h2v2_upsample_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
push ebx
|
push ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -653,13 +653,13 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||||
test ecx,ecx
|
test ecx, ecx
|
||||||
jz near .return
|
jz near .return
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||||
mov edi, POINTER [output_data_ptr(ebp)]
|
mov edi, POINTER [output_data_ptr(ebp)]
|
||||||
mov edi, JSAMPARRAY [edi] ; output_data
|
mov edi, JSAMPARRAY [edi] ; output_data
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
push edi
|
push edi
|
||||||
push esi
|
push esi
|
||||||
@@ -667,15 +667,15 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
mov esi, JSAMPROW [esi] ; inptr
|
mov esi, JSAMPROW [esi] ; inptr
|
||||||
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
|
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||||
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
|
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||||
mov eax,edx ; colctr
|
mov eax, edx ; colctr
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.columnloop:
|
.columnloop:
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklbw xmm0,xmm0
|
punpcklbw xmm0, xmm0
|
||||||
punpckhbw xmm1,xmm1
|
punpckhbw xmm1, xmm1
|
||||||
|
|
||||||
movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
|
movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
|
||||||
movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
|
movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
|
||||||
@@ -687,9 +687,9 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
|
|
||||||
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||||
|
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
punpcklbw xmm2,xmm2
|
punpcklbw xmm2, xmm2
|
||||||
punpckhbw xmm3,xmm3
|
punpckhbw xmm3, xmm3
|
||||||
|
|
||||||
movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
|
movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
|
||||||
movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
|
movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
|
||||||
@@ -703,7 +703,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
|
add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
|
||||||
add edi, byte 4*SIZEOF_XMMWORD ; outptr1
|
add edi, byte 4*SIZEOF_XMMWORD ; outptr1
|
||||||
jmp short .columnloop
|
jmp short .columnloop
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
|
|
||||||
.nextrow:
|
.nextrow:
|
||||||
pop esi
|
pop esi
|
||||||
|
|||||||
@@ -26,11 +26,11 @@
|
|||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
|
|
||||||
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||||
shufps %1,%2,0x44
|
shufps %1, %2, 0x44
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||||
shufps %1,%2,0xEE
|
shufps %1, %2, 0xEE
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -68,11 +68,11 @@ PD_1_306 times 4 dd 1.306562964876376527856643
|
|||||||
|
|
||||||
EXTN(jsimd_fdct_float_sse):
|
EXTN(jsimd_fdct_float_sse):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
@@ -90,12 +90,12 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
|
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
|
||||||
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
|
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
|
||||||
|
|
||||||
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
|
unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
|
||||||
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
|
unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
|
||||||
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
|
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
|
unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
|
||||||
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
|
unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
|
||||||
|
|
||||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -108,64 +108,64 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
|
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
|
||||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
|
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
|
||||||
|
|
||||||
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
|
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||||
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
|
unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
|
||||||
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
|
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
|
unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
|
||||||
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
|
unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
|
||||||
|
|
||||||
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
|
unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
|
||||||
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
|
unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
|
||||||
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
|
unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
|
||||||
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
|
unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
|
||||||
|
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
|
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
|
||||||
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||||
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
|
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
|
||||||
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
|
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
|
||||||
|
|
||||||
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
|
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
|
||||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
|
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
|
||||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||||
|
|
||||||
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
|
unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
|
||||||
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
|
unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
|
||||||
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
|
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
|
unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
|
||||||
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
|
unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
|
||||||
|
|
||||||
movaps xmm2,xmm7
|
movaps xmm2, xmm7
|
||||||
movaps xmm3,xmm4
|
movaps xmm3, xmm4
|
||||||
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
|
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
|
||||||
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||||
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
|
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
|
||||||
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movaps xmm1,xmm5
|
movaps xmm1, xmm5
|
||||||
movaps xmm6,xmm0
|
movaps xmm6, xmm0
|
||||||
subps xmm5,xmm7 ; xmm5=tmp13
|
subps xmm5, xmm7 ; xmm5=tmp13
|
||||||
subps xmm0,xmm4 ; xmm0=tmp12
|
subps xmm0, xmm4 ; xmm0=tmp12
|
||||||
addps xmm1,xmm7 ; xmm1=tmp10
|
addps xmm1, xmm7 ; xmm1=tmp10
|
||||||
addps xmm6,xmm4 ; xmm6=tmp11
|
addps xmm6, xmm4 ; xmm6=tmp11
|
||||||
|
|
||||||
addps xmm0,xmm5
|
addps xmm0, xmm5
|
||||||
mulps xmm0,[rel PD_0_707] ; xmm0=z1
|
mulps xmm0, [rel PD_0_707] ; xmm0=z1
|
||||||
|
|
||||||
movaps xmm7,xmm1
|
movaps xmm7, xmm1
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm1,xmm6 ; xmm1=data4
|
subps xmm1, xmm6 ; xmm1=data4
|
||||||
subps xmm5,xmm0 ; xmm5=data6
|
subps xmm5, xmm0 ; xmm5=data6
|
||||||
addps xmm7,xmm6 ; xmm7=data0
|
addps xmm7, xmm6 ; xmm7=data0
|
||||||
addps xmm4,xmm0 ; xmm4=data2
|
addps xmm4, xmm0 ; xmm4=data2
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
||||||
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -177,30 +177,30 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||||
|
|
||||||
addps xmm2,xmm3 ; xmm2=tmp10
|
addps xmm2, xmm3 ; xmm2=tmp10
|
||||||
addps xmm3,xmm6 ; xmm3=tmp11
|
addps xmm3, xmm6 ; xmm3=tmp11
|
||||||
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
|
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||||
|
|
||||||
mulps xmm3,[rel PD_0_707] ; xmm3=z3
|
mulps xmm3, [rel PD_0_707] ; xmm3=z3
|
||||||
|
|
||||||
movaps xmm1,xmm2 ; xmm1=tmp10
|
movaps xmm1, xmm2 ; xmm1=tmp10
|
||||||
subps xmm2,xmm6
|
subps xmm2, xmm6
|
||||||
mulps xmm2,[rel PD_0_382] ; xmm2=z5
|
mulps xmm2, [rel PD_0_382] ; xmm2=z5
|
||||||
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
addps xmm1,xmm2 ; xmm1=z2
|
addps xmm1, xmm2 ; xmm1=z2
|
||||||
addps xmm6,xmm2 ; xmm6=z4
|
addps xmm6, xmm2 ; xmm6=z4
|
||||||
|
|
||||||
movaps xmm5,xmm0
|
movaps xmm5, xmm0
|
||||||
subps xmm0,xmm3 ; xmm0=z13
|
subps xmm0, xmm3 ; xmm0=z13
|
||||||
addps xmm5,xmm3 ; xmm5=z11
|
addps xmm5, xmm3 ; xmm5=z11
|
||||||
|
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm0,xmm1 ; xmm0=data3
|
subps xmm0, xmm1 ; xmm0=data3
|
||||||
subps xmm5,xmm6 ; xmm5=data7
|
subps xmm5, xmm6 ; xmm5=data7
|
||||||
addps xmm7,xmm1 ; xmm7=data5
|
addps xmm7, xmm1 ; xmm7=data5
|
||||||
addps xmm4,xmm6 ; xmm4=data1
|
addps xmm4, xmm6 ; xmm4=data1
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -225,12 +225,12 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
|
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
|
||||||
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
|
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
|
||||||
|
|
||||||
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
|
unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
|
||||||
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
|
unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
|
||||||
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
|
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
|
unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
|
||||||
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
|
unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
|
||||||
|
|
||||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -243,64 +243,64 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
|
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
|
||||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
|
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
|
||||||
|
|
||||||
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
|
unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
|
||||||
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
|
unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
|
||||||
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
|
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
|
unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
|
||||||
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
|
unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
|
||||||
|
|
||||||
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
|
unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
|
||||||
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
|
unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
|
||||||
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
|
unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
|
||||||
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
|
unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
|
||||||
|
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
|
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
|
||||||
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||||
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
|
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
|
||||||
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
|
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
|
||||||
|
|
||||||
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
|
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
|
||||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
|
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
|
||||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||||
|
|
||||||
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
|
unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
|
||||||
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
|
unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
|
||||||
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
|
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
|
unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
|
||||||
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
|
unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
|
||||||
|
|
||||||
movaps xmm2,xmm7
|
movaps xmm2, xmm7
|
||||||
movaps xmm3,xmm4
|
movaps xmm3, xmm4
|
||||||
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
|
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
|
||||||
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||||
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
|
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
|
||||||
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movaps xmm1,xmm5
|
movaps xmm1, xmm5
|
||||||
movaps xmm6,xmm0
|
movaps xmm6, xmm0
|
||||||
subps xmm5,xmm7 ; xmm5=tmp13
|
subps xmm5, xmm7 ; xmm5=tmp13
|
||||||
subps xmm0,xmm4 ; xmm0=tmp12
|
subps xmm0, xmm4 ; xmm0=tmp12
|
||||||
addps xmm1,xmm7 ; xmm1=tmp10
|
addps xmm1, xmm7 ; xmm1=tmp10
|
||||||
addps xmm6,xmm4 ; xmm6=tmp11
|
addps xmm6, xmm4 ; xmm6=tmp11
|
||||||
|
|
||||||
addps xmm0,xmm5
|
addps xmm0, xmm5
|
||||||
mulps xmm0,[rel PD_0_707] ; xmm0=z1
|
mulps xmm0, [rel PD_0_707] ; xmm0=z1
|
||||||
|
|
||||||
movaps xmm7,xmm1
|
movaps xmm7, xmm1
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm1,xmm6 ; xmm1=data4
|
subps xmm1, xmm6 ; xmm1=data4
|
||||||
subps xmm5,xmm0 ; xmm5=data6
|
subps xmm5, xmm0 ; xmm5=data6
|
||||||
addps xmm7,xmm6 ; xmm7=data0
|
addps xmm7, xmm6 ; xmm7=data0
|
||||||
addps xmm4,xmm0 ; xmm4=data2
|
addps xmm4, xmm0 ; xmm4=data2
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
||||||
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -312,30 +312,30 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||||
|
|
||||||
addps xmm2,xmm3 ; xmm2=tmp10
|
addps xmm2, xmm3 ; xmm2=tmp10
|
||||||
addps xmm3,xmm6 ; xmm3=tmp11
|
addps xmm3, xmm6 ; xmm3=tmp11
|
||||||
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
|
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||||
|
|
||||||
mulps xmm3,[rel PD_0_707] ; xmm3=z3
|
mulps xmm3, [rel PD_0_707] ; xmm3=z3
|
||||||
|
|
||||||
movaps xmm1,xmm2 ; xmm1=tmp10
|
movaps xmm1, xmm2 ; xmm1=tmp10
|
||||||
subps xmm2,xmm6
|
subps xmm2, xmm6
|
||||||
mulps xmm2,[rel PD_0_382] ; xmm2=z5
|
mulps xmm2, [rel PD_0_382] ; xmm2=z5
|
||||||
mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
addps xmm1,xmm2 ; xmm1=z2
|
addps xmm1, xmm2 ; xmm1=z2
|
||||||
addps xmm6,xmm2 ; xmm6=z4
|
addps xmm6, xmm2 ; xmm6=z4
|
||||||
|
|
||||||
movaps xmm5,xmm0
|
movaps xmm5, xmm0
|
||||||
subps xmm0,xmm3 ; xmm0=z13
|
subps xmm0, xmm3 ; xmm0=z13
|
||||||
addps xmm5,xmm3 ; xmm5=z11
|
addps xmm5, xmm3 ; xmm5=z11
|
||||||
|
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm0,xmm1 ; xmm0=data3
|
subps xmm0, xmm1 ; xmm0=data3
|
||||||
subps xmm5,xmm6 ; xmm5=data7
|
subps xmm5, xmm6 ; xmm5=data7
|
||||||
addps xmm7,xmm1 ; xmm7=data5
|
addps xmm7, xmm1 ; xmm7=data5
|
||||||
addps xmm4,xmm6 ; xmm4=data1
|
addps xmm4, xmm6 ; xmm4=data1
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -347,7 +347,7 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
jnz near .columnloop
|
jnz near .columnloop
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -25,11 +25,11 @@
|
|||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
|
|
||||||
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||||
shufps %1,%2,0x44
|
shufps %1, %2, 0x44
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||||
shufps %1,%2,0xEE
|
shufps %1, %2, 0xEE
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -68,11 +68,11 @@ PD_1_306 times 4 dd 1.306562964876376527856643
|
|||||||
|
|
||||||
EXTN(jsimd_fdct_float_sse):
|
EXTN(jsimd_fdct_float_sse):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
@@ -86,7 +86,7 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
|
|
||||||
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
|
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
|
||||||
mov ecx, DCTSIZE/4
|
mov ecx, DCTSIZE/4
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
|
|
||||||
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
|
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -97,12 +97,12 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
|
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
|
||||||
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
|
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
|
||||||
|
|
||||||
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
|
unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
|
||||||
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
|
unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
|
||||||
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
|
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
|
unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
|
||||||
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
|
unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
|
||||||
|
|
||||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -115,64 +115,64 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
|
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
|
||||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
|
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
|
||||||
|
|
||||||
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
|
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||||
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
|
unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
|
||||||
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
|
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
|
unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
|
||||||
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
|
unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
|
||||||
|
|
||||||
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
|
unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
|
||||||
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
|
unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
|
||||||
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
|
unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
|
||||||
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
|
unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
|
||||||
|
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
|
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
|
||||||
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||||
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
|
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
|
||||||
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
|
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
|
||||||
|
|
||||||
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
|
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
|
||||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
|
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
|
||||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||||
|
|
||||||
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
|
unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
|
||||||
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
|
unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
|
||||||
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
|
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
|
unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
|
||||||
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
|
unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
|
||||||
|
|
||||||
movaps xmm2,xmm7
|
movaps xmm2, xmm7
|
||||||
movaps xmm3,xmm4
|
movaps xmm3, xmm4
|
||||||
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
|
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
|
||||||
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||||
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
|
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
|
||||||
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movaps xmm1,xmm5
|
movaps xmm1, xmm5
|
||||||
movaps xmm6,xmm0
|
movaps xmm6, xmm0
|
||||||
subps xmm5,xmm7 ; xmm5=tmp13
|
subps xmm5, xmm7 ; xmm5=tmp13
|
||||||
subps xmm0,xmm4 ; xmm0=tmp12
|
subps xmm0, xmm4 ; xmm0=tmp12
|
||||||
addps xmm1,xmm7 ; xmm1=tmp10
|
addps xmm1, xmm7 ; xmm1=tmp10
|
||||||
addps xmm6,xmm4 ; xmm6=tmp11
|
addps xmm6, xmm4 ; xmm6=tmp11
|
||||||
|
|
||||||
addps xmm0,xmm5
|
addps xmm0, xmm5
|
||||||
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
|
mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
|
||||||
|
|
||||||
movaps xmm7,xmm1
|
movaps xmm7, xmm1
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm1,xmm6 ; xmm1=data4
|
subps xmm1, xmm6 ; xmm1=data4
|
||||||
subps xmm5,xmm0 ; xmm5=data6
|
subps xmm5, xmm0 ; xmm5=data6
|
||||||
addps xmm7,xmm6 ; xmm7=data0
|
addps xmm7, xmm6 ; xmm7=data0
|
||||||
addps xmm4,xmm0 ; xmm4=data2
|
addps xmm4, xmm0 ; xmm4=data2
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
|
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
|
||||||
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -184,30 +184,30 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||||
|
|
||||||
addps xmm2,xmm3 ; xmm2=tmp10
|
addps xmm2, xmm3 ; xmm2=tmp10
|
||||||
addps xmm3,xmm6 ; xmm3=tmp11
|
addps xmm3, xmm6 ; xmm3=tmp11
|
||||||
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
|
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||||
|
|
||||||
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
|
mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
|
||||||
|
|
||||||
movaps xmm1,xmm2 ; xmm1=tmp10
|
movaps xmm1, xmm2 ; xmm1=tmp10
|
||||||
subps xmm2,xmm6
|
subps xmm2, xmm6
|
||||||
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
|
mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
|
||||||
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
addps xmm1,xmm2 ; xmm1=z2
|
addps xmm1, xmm2 ; xmm1=z2
|
||||||
addps xmm6,xmm2 ; xmm6=z4
|
addps xmm6, xmm2 ; xmm6=z4
|
||||||
|
|
||||||
movaps xmm5,xmm0
|
movaps xmm5, xmm0
|
||||||
subps xmm0,xmm3 ; xmm0=z13
|
subps xmm0, xmm3 ; xmm0=z13
|
||||||
addps xmm5,xmm3 ; xmm5=z11
|
addps xmm5, xmm3 ; xmm5=z11
|
||||||
|
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm0,xmm1 ; xmm0=data3
|
subps xmm0, xmm1 ; xmm0=data3
|
||||||
subps xmm5,xmm6 ; xmm5=data7
|
subps xmm5, xmm6 ; xmm5=data7
|
||||||
addps xmm7,xmm1 ; xmm7=data5
|
addps xmm7, xmm1 ; xmm7=data5
|
||||||
addps xmm4,xmm6 ; xmm4=data1
|
addps xmm4, xmm6 ; xmm4=data1
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -222,7 +222,7 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
|
|
||||||
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
|
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
|
||||||
mov ecx, DCTSIZE/4
|
mov ecx, DCTSIZE/4
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.columnloop:
|
.columnloop:
|
||||||
|
|
||||||
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
|
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -233,12 +233,12 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
|
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
|
||||||
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
|
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
|
||||||
|
|
||||||
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
|
unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
|
||||||
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
|
unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
|
||||||
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
|
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
|
unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
|
||||||
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
|
unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
|
||||||
|
|
||||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -251,64 +251,64 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
|
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
|
||||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
|
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
|
||||||
|
|
||||||
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
|
unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
|
||||||
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
|
unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
|
||||||
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
|
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
|
unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
|
||||||
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
|
unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
|
||||||
|
|
||||||
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
|
unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
|
||||||
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
|
unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
|
||||||
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
|
unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
|
||||||
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
|
unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
|
||||||
|
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
|
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
|
||||||
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||||
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
|
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
|
||||||
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
|
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
|
||||||
|
|
||||||
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
|
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
|
||||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
|
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
|
||||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||||
|
|
||||||
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
|
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
|
unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
|
||||||
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
|
unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
|
||||||
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
|
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
|
unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
|
||||||
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
|
unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
|
||||||
|
|
||||||
movaps xmm2,xmm7
|
movaps xmm2, xmm7
|
||||||
movaps xmm3,xmm4
|
movaps xmm3, xmm4
|
||||||
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
|
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
|
||||||
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||||
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
|
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
|
||||||
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movaps xmm1,xmm5
|
movaps xmm1, xmm5
|
||||||
movaps xmm6,xmm0
|
movaps xmm6, xmm0
|
||||||
subps xmm5,xmm7 ; xmm5=tmp13
|
subps xmm5, xmm7 ; xmm5=tmp13
|
||||||
subps xmm0,xmm4 ; xmm0=tmp12
|
subps xmm0, xmm4 ; xmm0=tmp12
|
||||||
addps xmm1,xmm7 ; xmm1=tmp10
|
addps xmm1, xmm7 ; xmm1=tmp10
|
||||||
addps xmm6,xmm4 ; xmm6=tmp11
|
addps xmm6, xmm4 ; xmm6=tmp11
|
||||||
|
|
||||||
addps xmm0,xmm5
|
addps xmm0, xmm5
|
||||||
mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
|
mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
|
||||||
|
|
||||||
movaps xmm7,xmm1
|
movaps xmm7, xmm1
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm1,xmm6 ; xmm1=data4
|
subps xmm1, xmm6 ; xmm1=data4
|
||||||
subps xmm5,xmm0 ; xmm5=data6
|
subps xmm5, xmm0 ; xmm5=data6
|
||||||
addps xmm7,xmm6 ; xmm7=data0
|
addps xmm7, xmm6 ; xmm7=data0
|
||||||
addps xmm4,xmm0 ; xmm4=data2
|
addps xmm4, xmm0 ; xmm4=data2
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
|
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
|
||||||
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -320,30 +320,30 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||||
|
|
||||||
addps xmm2,xmm3 ; xmm2=tmp10
|
addps xmm2, xmm3 ; xmm2=tmp10
|
||||||
addps xmm3,xmm6 ; xmm3=tmp11
|
addps xmm3, xmm6 ; xmm3=tmp11
|
||||||
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
|
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||||
|
|
||||||
mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
|
mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
|
||||||
|
|
||||||
movaps xmm1,xmm2 ; xmm1=tmp10
|
movaps xmm1, xmm2 ; xmm1=tmp10
|
||||||
subps xmm2,xmm6
|
subps xmm2, xmm6
|
||||||
mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
|
mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
|
||||||
mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
addps xmm1,xmm2 ; xmm1=z2
|
addps xmm1, xmm2 ; xmm1=z2
|
||||||
addps xmm6,xmm2 ; xmm6=z4
|
addps xmm6, xmm2 ; xmm6=z4
|
||||||
|
|
||||||
movaps xmm5,xmm0
|
movaps xmm5, xmm0
|
||||||
subps xmm0,xmm3 ; xmm0=z13
|
subps xmm0, xmm3 ; xmm0=z13
|
||||||
addps xmm5,xmm3 ; xmm5=z11
|
addps xmm5, xmm3 ; xmm5=z11
|
||||||
|
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
movaps xmm4,xmm5
|
movaps xmm4, xmm5
|
||||||
subps xmm0,xmm1 ; xmm0=data3
|
subps xmm0, xmm1 ; xmm0=data3
|
||||||
subps xmm5,xmm6 ; xmm5=data7
|
subps xmm5, xmm6 ; xmm5=data7
|
||||||
addps xmm7,xmm1 ; xmm7=data5
|
addps xmm7, xmm1 ; xmm7=data5
|
||||||
addps xmm4,xmm6 ; xmm4=data1
|
addps xmm4, xmm6 ; xmm4=data1
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
@@ -359,7 +359,7 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
poppic ebx
|
poppic ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -36,10 +36,10 @@ F_1_306 equ 334 ; FIX(1.306562965)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
|
F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
|
||||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
|
||||||
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
|
F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
|
||||||
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
|
F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -83,11 +83,11 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
|
|||||||
|
|
||||||
EXTN(jsimd_fdct_ifast_sse2):
|
EXTN(jsimd_fdct_ifast_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
@@ -103,12 +103,12 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
|
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
|
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
|
||||||
@@ -121,84 +121,84 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||||
movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
|
||||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
|
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
|
||||||
|
|
||||||
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||||
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||||
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
movdqa xmm3,xmm0
|
movdqa xmm3, xmm0
|
||||||
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
|
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||||
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
|
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||||
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
|
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||||
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
|
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
|
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||||
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||||
movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||||
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||||
|
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
|
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||||
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
|
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||||
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
|
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||||
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
|
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
psubw xmm3,xmm1 ; xmm3=tmp13
|
psubw xmm3, xmm1 ; xmm3=tmp13
|
||||||
psubw xmm6,xmm7 ; xmm6=tmp12
|
psubw xmm6, xmm7 ; xmm6=tmp12
|
||||||
paddw xmm4,xmm1 ; xmm4=tmp10
|
paddw xmm4, xmm1 ; xmm4=tmp10
|
||||||
paddw xmm0,xmm7 ; xmm0=tmp11
|
paddw xmm0, xmm7 ; xmm0=tmp11
|
||||||
|
|
||||||
paddw xmm6,xmm3
|
paddw xmm6, xmm3
|
||||||
psllw xmm6,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm6,[rel PW_F0707] ; xmm6=z1
|
pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
|
||||||
|
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
movdqa xmm7,xmm3
|
movdqa xmm7, xmm3
|
||||||
psubw xmm4,xmm0 ; xmm4=data4
|
psubw xmm4, xmm0 ; xmm4=data4
|
||||||
psubw xmm3,xmm6 ; xmm3=data6
|
psubw xmm3, xmm6 ; xmm3=data6
|
||||||
paddw xmm1,xmm0 ; xmm1=data0
|
paddw xmm1, xmm0 ; xmm1=data0
|
||||||
paddw xmm7,xmm6 ; xmm7=data2
|
paddw xmm7, xmm6 ; xmm7=data2
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
|
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
|
||||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
|
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
|
||||||
@@ -207,46 +207,46 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
|
|
||||||
; -- Odd part
|
; -- Odd part
|
||||||
|
|
||||||
paddw xmm2,xmm5 ; xmm2=tmp10
|
paddw xmm2, xmm5 ; xmm2=tmp10
|
||||||
paddw xmm5,xmm0 ; xmm5=tmp11
|
paddw xmm5, xmm0 ; xmm5=tmp11
|
||||||
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
|
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
|
||||||
|
|
||||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm5,[rel PW_F0707] ; xmm5=z3
|
pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
|
||||||
|
|
||||||
movdqa xmm4,xmm2 ; xmm4=tmp10
|
movdqa xmm4, xmm2 ; xmm4=tmp10
|
||||||
psubw xmm2,xmm0
|
psubw xmm2, xmm0
|
||||||
pmulhw xmm2,[rel PW_F0382] ; xmm2=z5
|
pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
|
||||||
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
|
pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
paddw xmm4,xmm2 ; xmm4=z2
|
paddw xmm4, xmm2 ; xmm4=z2
|
||||||
paddw xmm0,xmm2 ; xmm0=z4
|
paddw xmm0, xmm2 ; xmm0=z4
|
||||||
|
|
||||||
movdqa xmm3,xmm6
|
movdqa xmm3, xmm6
|
||||||
psubw xmm6,xmm5 ; xmm6=z13
|
psubw xmm6, xmm5 ; xmm6=z13
|
||||||
paddw xmm3,xmm5 ; xmm3=z11
|
paddw xmm3, xmm5 ; xmm3=z11
|
||||||
|
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
psubw xmm6,xmm4 ; xmm6=data3
|
psubw xmm6, xmm4 ; xmm6=data3
|
||||||
psubw xmm3,xmm0 ; xmm3=data7
|
psubw xmm3, xmm0 ; xmm3=data7
|
||||||
paddw xmm2,xmm4 ; xmm2=data5
|
paddw xmm2, xmm4 ; xmm2=data5
|
||||||
paddw xmm5,xmm0 ; xmm5=data1
|
paddw xmm5, xmm0 ; xmm5=data1
|
||||||
|
|
||||||
; ---- Pass 2: process columns.
|
; ---- Pass 2: process columns.
|
||||||
|
|
||||||
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
|
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
|
||||||
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
|
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
|
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
|
||||||
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
|
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
|
||||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
|
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
|
||||||
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
|
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
|
||||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
|
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
|
||||||
@@ -257,84 +257,84 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
|
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
|
||||||
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
|
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
|
||||||
movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
|
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
|
||||||
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
|
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
|
||||||
|
|
||||||
movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
|
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
|
||||||
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
|
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
|
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
|
||||||
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
|
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
|
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
|
||||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
|
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
|
||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
|
||||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
|
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
|
||||||
|
|
||||||
movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
|
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
|
||||||
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
|
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
|
||||||
movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
|
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
|
||||||
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
|
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
|
||||||
|
|
||||||
movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
|
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
|
||||||
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
|
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
|
||||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
|
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
|
||||||
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
|
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
|
||||||
|
|
||||||
movdqa xmm5,xmm6
|
movdqa xmm5, xmm6
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
|
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
|
||||||
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
|
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
|
||||||
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
|
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
|
||||||
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
|
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
|
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
|
||||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
|
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
|
||||||
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
|
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
|
||||||
movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
|
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
|
||||||
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
|
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
|
||||||
|
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
|
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
|
||||||
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
|
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
|
||||||
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
|
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
|
||||||
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
|
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
psubw xmm3,xmm6 ; xmm3=tmp13
|
psubw xmm3, xmm6 ; xmm3=tmp13
|
||||||
psubw xmm5,xmm2 ; xmm5=tmp12
|
psubw xmm5, xmm2 ; xmm5=tmp12
|
||||||
paddw xmm4,xmm6 ; xmm4=tmp10
|
paddw xmm4, xmm6 ; xmm4=tmp10
|
||||||
paddw xmm1,xmm2 ; xmm1=tmp11
|
paddw xmm1, xmm2 ; xmm1=tmp11
|
||||||
|
|
||||||
paddw xmm5,xmm3
|
paddw xmm5, xmm3
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm5,[rel PW_F0707] ; xmm5=z1
|
pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
|
||||||
|
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
psubw xmm4,xmm1 ; xmm4=data4
|
psubw xmm4, xmm1 ; xmm4=data4
|
||||||
psubw xmm3,xmm5 ; xmm3=data6
|
psubw xmm3, xmm5 ; xmm3=data6
|
||||||
paddw xmm6,xmm1 ; xmm6=data0
|
paddw xmm6, xmm1 ; xmm6=data0
|
||||||
paddw xmm2,xmm5 ; xmm2=data2
|
paddw xmm2, xmm5 ; xmm2=data2
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
|
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
|
||||||
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||||
@@ -346,34 +346,34 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||||
|
|
||||||
paddw xmm7,xmm0 ; xmm7=tmp10
|
paddw xmm7, xmm0 ; xmm7=tmp10
|
||||||
paddw xmm0,xmm1 ; xmm0=tmp11
|
paddw xmm0, xmm1 ; xmm0=tmp11
|
||||||
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
|
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
|
||||||
|
|
||||||
psllw xmm7,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm0,[rel PW_F0707] ; xmm0=z3
|
pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
|
||||||
|
|
||||||
movdqa xmm4,xmm7 ; xmm4=tmp10
|
movdqa xmm4, xmm7 ; xmm4=tmp10
|
||||||
psubw xmm7,xmm1
|
psubw xmm7, xmm1
|
||||||
pmulhw xmm7,[rel PW_F0382] ; xmm7=z5
|
pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
|
||||||
pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
|
pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
paddw xmm4,xmm7 ; xmm4=z2
|
paddw xmm4, xmm7 ; xmm4=z2
|
||||||
paddw xmm1,xmm7 ; xmm1=z4
|
paddw xmm1, xmm7 ; xmm1=z4
|
||||||
|
|
||||||
movdqa xmm3,xmm5
|
movdqa xmm3, xmm5
|
||||||
psubw xmm5,xmm0 ; xmm5=z13
|
psubw xmm5, xmm0 ; xmm5=z13
|
||||||
paddw xmm3,xmm0 ; xmm3=z11
|
paddw xmm3, xmm0 ; xmm3=z11
|
||||||
|
|
||||||
movdqa xmm6,xmm5
|
movdqa xmm6, xmm5
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
psubw xmm5,xmm4 ; xmm5=data3
|
psubw xmm5, xmm4 ; xmm5=data3
|
||||||
psubw xmm3,xmm1 ; xmm3=data7
|
psubw xmm3, xmm1 ; xmm3=data7
|
||||||
paddw xmm6,xmm4 ; xmm6=data5
|
paddw xmm6, xmm4 ; xmm6=data5
|
||||||
paddw xmm2,xmm1 ; xmm2=data1
|
paddw xmm2, xmm1 ; xmm2=data1
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
|
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
|
||||||
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||||
@@ -381,7 +381,7 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -35,10 +35,10 @@ F_1_306 equ 334 ; FIX(1.306562965)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
|
F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
|
||||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
|
||||||
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
|
F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
|
||||||
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
|
F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -83,11 +83,11 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
|
|||||||
|
|
||||||
EXTN(jsimd_fdct_ifast_sse2):
|
EXTN(jsimd_fdct_ifast_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
@@ -109,12 +109,12 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||||
@@ -127,84 +127,84 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||||
movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
|
||||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
|
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
|
||||||
|
|
||||||
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||||
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||||
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
movdqa xmm3,xmm0
|
movdqa xmm3, xmm0
|
||||||
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
|
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||||
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
|
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||||
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
|
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||||
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
|
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
|
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||||
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||||
movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||||
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||||
|
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
|
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||||
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
|
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||||
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
|
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||||
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
|
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
psubw xmm3,xmm1 ; xmm3=tmp13
|
psubw xmm3, xmm1 ; xmm3=tmp13
|
||||||
psubw xmm6,xmm7 ; xmm6=tmp12
|
psubw xmm6, xmm7 ; xmm6=tmp12
|
||||||
paddw xmm4,xmm1 ; xmm4=tmp10
|
paddw xmm4, xmm1 ; xmm4=tmp10
|
||||||
paddw xmm0,xmm7 ; xmm0=tmp11
|
paddw xmm0, xmm7 ; xmm0=tmp11
|
||||||
|
|
||||||
paddw xmm6,xmm3
|
paddw xmm6, xmm3
|
||||||
psllw xmm6,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
|
pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
|
||||||
|
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
movdqa xmm7,xmm3
|
movdqa xmm7, xmm3
|
||||||
psubw xmm4,xmm0 ; xmm4=data4
|
psubw xmm4, xmm0 ; xmm4=data4
|
||||||
psubw xmm3,xmm6 ; xmm3=data6
|
psubw xmm3, xmm6 ; xmm3=data6
|
||||||
paddw xmm1,xmm0 ; xmm1=data0
|
paddw xmm1, xmm0 ; xmm1=data0
|
||||||
paddw xmm7,xmm6 ; xmm7=data2
|
paddw xmm7, xmm6 ; xmm7=data2
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
|
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
|
||||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
|
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
|
||||||
@@ -213,34 +213,34 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
|
|
||||||
; -- Odd part
|
; -- Odd part
|
||||||
|
|
||||||
paddw xmm2,xmm5 ; xmm2=tmp10
|
paddw xmm2, xmm5 ; xmm2=tmp10
|
||||||
paddw xmm5,xmm0 ; xmm5=tmp11
|
paddw xmm5, xmm0 ; xmm5=tmp11
|
||||||
paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
|
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
|
||||||
|
|
||||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
|
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
|
||||||
|
|
||||||
movdqa xmm4,xmm2 ; xmm4=tmp10
|
movdqa xmm4, xmm2 ; xmm4=tmp10
|
||||||
psubw xmm2,xmm0
|
psubw xmm2, xmm0
|
||||||
pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
|
pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
|
||||||
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
|
pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
paddw xmm4,xmm2 ; xmm4=z2
|
paddw xmm4, xmm2 ; xmm4=z2
|
||||||
paddw xmm0,xmm2 ; xmm0=z4
|
paddw xmm0, xmm2 ; xmm0=z4
|
||||||
|
|
||||||
movdqa xmm3,xmm6
|
movdqa xmm3, xmm6
|
||||||
psubw xmm6,xmm5 ; xmm6=z13
|
psubw xmm6, xmm5 ; xmm6=z13
|
||||||
paddw xmm3,xmm5 ; xmm3=z11
|
paddw xmm3, xmm5 ; xmm3=z11
|
||||||
|
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
psubw xmm6,xmm4 ; xmm6=data3
|
psubw xmm6, xmm4 ; xmm6=data3
|
||||||
psubw xmm3,xmm0 ; xmm3=data7
|
psubw xmm3, xmm0 ; xmm3=data7
|
||||||
paddw xmm2,xmm4 ; xmm2=data5
|
paddw xmm2, xmm4 ; xmm2=data5
|
||||||
paddw xmm5,xmm0 ; xmm5=data1
|
paddw xmm5, xmm0 ; xmm5=data1
|
||||||
|
|
||||||
; ---- Pass 2: process columns.
|
; ---- Pass 2: process columns.
|
||||||
|
|
||||||
@@ -249,12 +249,12 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
|
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
|
||||||
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
|
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
|
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
|
||||||
punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
|
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
|
||||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
|
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
|
||||||
punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
|
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
|
||||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
|
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
|
||||||
@@ -265,84 +265,84 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
|
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
|
||||||
punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
|
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
|
||||||
movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
|
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
|
||||||
punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
|
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
|
||||||
|
|
||||||
movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
|
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
|
||||||
punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
|
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
|
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
|
||||||
punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
|
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
|
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
|
||||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
|
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
|
||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
|
||||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
|
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
|
||||||
|
|
||||||
movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
|
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
|
||||||
punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
|
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
|
||||||
movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
|
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
|
||||||
punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
|
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
|
||||||
|
|
||||||
movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
|
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
|
||||||
punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
|
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
|
||||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
|
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
|
||||||
punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
|
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
|
||||||
|
|
||||||
movdqa xmm5,xmm6
|
movdqa xmm5, xmm6
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
|
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
|
||||||
psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
|
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
|
||||||
paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
|
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
|
||||||
paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
|
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
|
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
|
||||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
|
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
|
||||||
punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
|
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
|
||||||
movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
|
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
|
||||||
punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
|
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
|
||||||
|
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
|
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
|
||||||
paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
|
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
|
||||||
psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
|
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
|
||||||
psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
|
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
psubw xmm3,xmm6 ; xmm3=tmp13
|
psubw xmm3, xmm6 ; xmm3=tmp13
|
||||||
psubw xmm5,xmm2 ; xmm5=tmp12
|
psubw xmm5, xmm2 ; xmm5=tmp12
|
||||||
paddw xmm4,xmm6 ; xmm4=tmp10
|
paddw xmm4, xmm6 ; xmm4=tmp10
|
||||||
paddw xmm1,xmm2 ; xmm1=tmp11
|
paddw xmm1, xmm2 ; xmm1=tmp11
|
||||||
|
|
||||||
paddw xmm5,xmm3
|
paddw xmm5, xmm3
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
|
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
|
||||||
|
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
psubw xmm4,xmm1 ; xmm4=data4
|
psubw xmm4, xmm1 ; xmm4=data4
|
||||||
psubw xmm3,xmm5 ; xmm3=data6
|
psubw xmm3, xmm5 ; xmm3=data6
|
||||||
paddw xmm6,xmm1 ; xmm6=data0
|
paddw xmm6, xmm1 ; xmm6=data0
|
||||||
paddw xmm2,xmm5 ; xmm2=data2
|
paddw xmm2, xmm5 ; xmm2=data2
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
|
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
|
||||||
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
|
||||||
@@ -354,34 +354,34 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||||
|
|
||||||
paddw xmm7,xmm0 ; xmm7=tmp10
|
paddw xmm7, xmm0 ; xmm7=tmp10
|
||||||
paddw xmm0,xmm1 ; xmm0=tmp11
|
paddw xmm0, xmm1 ; xmm0=tmp11
|
||||||
paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
|
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
|
||||||
|
|
||||||
psllw xmm7,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
|
pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
|
||||||
|
|
||||||
movdqa xmm4,xmm7 ; xmm4=tmp10
|
movdqa xmm4, xmm7 ; xmm4=tmp10
|
||||||
psubw xmm7,xmm1
|
psubw xmm7, xmm1
|
||||||
pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
|
pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
|
||||||
pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||||
pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
|
pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
|
||||||
paddw xmm4,xmm7 ; xmm4=z2
|
paddw xmm4, xmm7 ; xmm4=z2
|
||||||
paddw xmm1,xmm7 ; xmm1=z4
|
paddw xmm1, xmm7 ; xmm1=z4
|
||||||
|
|
||||||
movdqa xmm3,xmm5
|
movdqa xmm3, xmm5
|
||||||
psubw xmm5,xmm0 ; xmm5=z13
|
psubw xmm5, xmm0 ; xmm5=z13
|
||||||
paddw xmm3,xmm0 ; xmm3=z11
|
paddw xmm3, xmm0 ; xmm3=z11
|
||||||
|
|
||||||
movdqa xmm6,xmm5
|
movdqa xmm6, xmm5
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
psubw xmm5,xmm4 ; xmm5=data3
|
psubw xmm5, xmm4 ; xmm5=data3
|
||||||
psubw xmm3,xmm1 ; xmm3=data7
|
psubw xmm3, xmm1 ; xmm3=data7
|
||||||
paddw xmm6,xmm4 ; xmm6=data5
|
paddw xmm6, xmm4 ; xmm6=data5
|
||||||
paddw xmm2,xmm1 ; xmm2=data1
|
paddw xmm2, xmm1 ; xmm2=data1
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
|
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
|
||||||
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
|
||||||
|
|||||||
@@ -48,18 +48,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
|
||||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
|
||||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
|
||||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
|
||||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
|
||||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
|
||||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
|
||||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
|
||||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
|
||||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -104,11 +104,11 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
|
|||||||
|
|
||||||
EXTN(jsimd_fdct_islow_sse2):
|
EXTN(jsimd_fdct_islow_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
@@ -124,12 +124,12 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
|
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
|
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
|
||||||
@@ -142,80 +142,80 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||||
movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
|
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
|
||||||
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
|
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
|
||||||
|
|
||||||
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||||
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||||
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
movdqa xmm3,xmm0
|
movdqa xmm3, xmm0
|
||||||
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
|
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||||
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
|
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||||
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
|
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||||
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
|
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
|
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
|
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||||
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||||
movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||||
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||||
|
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
|
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||||
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
|
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||||
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
|
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||||
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
|
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
paddw xmm3,xmm1 ; xmm3=tmp10
|
paddw xmm3, xmm1 ; xmm3=tmp10
|
||||||
paddw xmm6,xmm7 ; xmm6=tmp11
|
paddw xmm6, xmm7 ; xmm6=tmp11
|
||||||
psubw xmm4,xmm1 ; xmm4=tmp13
|
psubw xmm4, xmm1 ; xmm4=tmp13
|
||||||
psubw xmm0,xmm7 ; xmm0=tmp12
|
psubw xmm0, xmm7 ; xmm0=tmp12
|
||||||
|
|
||||||
movdqa xmm1,xmm3
|
movdqa xmm1, xmm3
|
||||||
paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
|
paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
|
||||||
psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
|
psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
|
||||||
|
|
||||||
psllw xmm3,PASS1_BITS ; xmm3=data0
|
psllw xmm3, PASS1_BITS ; xmm3=data0
|
||||||
psllw xmm1,PASS1_BITS ; xmm1=data4
|
psllw xmm1, PASS1_BITS ; xmm1=data4
|
||||||
|
|
||||||
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
|
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
|
||||||
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
|
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
|
||||||
@@ -229,28 +229,28 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||||
|
|
||||||
movdqa xmm7,xmm4 ; xmm4=tmp13
|
movdqa xmm7, xmm4 ; xmm4=tmp13
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
punpcklwd xmm7,xmm0 ; xmm0=tmp12
|
punpcklwd xmm7, xmm0 ; xmm0=tmp12
|
||||||
punpckhwd xmm6,xmm0
|
punpckhwd xmm6, xmm0
|
||||||
movdqa xmm4,xmm7
|
movdqa xmm4, xmm7
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L
|
pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
|
||||||
pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H
|
pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
|
||||||
pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L
|
pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
|
||||||
pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H
|
pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
|
||||||
|
|
||||||
paddd xmm7,[rel PD_DESCALE_P1]
|
paddd xmm7, [rel PD_DESCALE_P1]
|
||||||
paddd xmm6,[rel PD_DESCALE_P1]
|
paddd xmm6, [rel PD_DESCALE_P1]
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
psrad xmm6,DESCALE_P1
|
psrad xmm6, DESCALE_P1
|
||||||
paddd xmm4,[rel PD_DESCALE_P1]
|
paddd xmm4, [rel PD_DESCALE_P1]
|
||||||
paddd xmm0,[rel PD_DESCALE_P1]
|
paddd xmm0, [rel PD_DESCALE_P1]
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm7,xmm6 ; xmm7=data2
|
packssdw xmm7, xmm6 ; xmm7=data2
|
||||||
packssdw xmm4,xmm0 ; xmm4=data6
|
packssdw xmm4, xmm0 ; xmm4=data6
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
|
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
|
||||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
|
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
|
||||||
@@ -260,10 +260,10 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
|
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
|
||||||
|
|
||||||
movdqa xmm6,xmm2 ; xmm2=tmp4
|
movdqa xmm6, xmm2 ; xmm2=tmp4
|
||||||
movdqa xmm0,xmm5 ; xmm5=tmp5
|
movdqa xmm0, xmm5 ; xmm5=tmp5
|
||||||
paddw xmm6,xmm3 ; xmm6=z3
|
paddw xmm6, xmm3 ; xmm6=z3
|
||||||
paddw xmm0,xmm1 ; xmm0=z4
|
paddw xmm0, xmm1 ; xmm0=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -274,16 +274,16 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpcklwd xmm7,xmm0
|
punpcklwd xmm7, xmm0
|
||||||
punpckhwd xmm4,xmm0
|
punpckhwd xmm4, xmm0
|
||||||
movdqa xmm6,xmm7
|
movdqa xmm6, xmm7
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L
|
pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
|
||||||
pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H
|
pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
|
||||||
pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L
|
pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
|
||||||
pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H
|
pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
|
||||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
|
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
|
||||||
@@ -304,61 +304,61 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||||
|
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
punpcklwd xmm7,xmm1
|
punpcklwd xmm7, xmm1
|
||||||
punpckhwd xmm4,xmm1
|
punpckhwd xmm4, xmm1
|
||||||
movdqa xmm2,xmm7
|
movdqa xmm2, xmm7
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L
|
pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
|
||||||
pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H
|
pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
|
||||||
pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L
|
pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
|
||||||
pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H
|
pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
|
||||||
|
|
||||||
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
|
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
|
||||||
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
|
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
|
||||||
paddd xmm2,xmm6 ; xmm2=data1L
|
paddd xmm2, xmm6 ; xmm2=data1L
|
||||||
paddd xmm1,xmm0 ; xmm1=data1H
|
paddd xmm1, xmm0 ; xmm1=data1H
|
||||||
|
|
||||||
paddd xmm7,[rel PD_DESCALE_P1]
|
paddd xmm7, [rel PD_DESCALE_P1]
|
||||||
paddd xmm4,[rel PD_DESCALE_P1]
|
paddd xmm4, [rel PD_DESCALE_P1]
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
paddd xmm2,[rel PD_DESCALE_P1]
|
paddd xmm2, [rel PD_DESCALE_P1]
|
||||||
paddd xmm1,[rel PD_DESCALE_P1]
|
paddd xmm1, [rel PD_DESCALE_P1]
|
||||||
psrad xmm2,DESCALE_P1
|
psrad xmm2, DESCALE_P1
|
||||||
psrad xmm1,DESCALE_P1
|
psrad xmm1, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm7,xmm4 ; xmm7=data7
|
packssdw xmm7, xmm4 ; xmm7=data7
|
||||||
packssdw xmm2,xmm1 ; xmm2=data1
|
packssdw xmm2, xmm1 ; xmm2=data1
|
||||||
|
|
||||||
movdqa xmm4,xmm5
|
movdqa xmm4, xmm5
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
punpcklwd xmm4,xmm3
|
punpcklwd xmm4, xmm3
|
||||||
punpckhwd xmm1,xmm3
|
punpckhwd xmm1, xmm3
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L
|
pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
|
||||||
pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H
|
pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
|
||||||
pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L
|
pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
|
||||||
pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H
|
pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
|
||||||
|
|
||||||
paddd xmm4,xmm6 ; xmm4=data5L
|
paddd xmm4, xmm6 ; xmm4=data5L
|
||||||
paddd xmm1,xmm0 ; xmm1=data5H
|
paddd xmm1, xmm0 ; xmm1=data5H
|
||||||
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
|
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
|
||||||
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
|
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
|
||||||
|
|
||||||
paddd xmm4,[rel PD_DESCALE_P1]
|
paddd xmm4, [rel PD_DESCALE_P1]
|
||||||
paddd xmm1,[rel PD_DESCALE_P1]
|
paddd xmm1, [rel PD_DESCALE_P1]
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
psrad xmm1,DESCALE_P1
|
psrad xmm1, DESCALE_P1
|
||||||
paddd xmm5,[rel PD_DESCALE_P1]
|
paddd xmm5, [rel PD_DESCALE_P1]
|
||||||
paddd xmm3,[rel PD_DESCALE_P1]
|
paddd xmm3, [rel PD_DESCALE_P1]
|
||||||
psrad xmm5,DESCALE_P1
|
psrad xmm5, DESCALE_P1
|
||||||
psrad xmm3,DESCALE_P1
|
psrad xmm3, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm4,xmm1 ; xmm4=data5
|
packssdw xmm4, xmm1 ; xmm4=data5
|
||||||
packssdw xmm5,xmm3 ; xmm5=data3
|
packssdw xmm5, xmm3 ; xmm5=data3
|
||||||
|
|
||||||
; ---- Pass 2: process columns.
|
; ---- Pass 2: process columns.
|
||||||
|
|
||||||
@@ -368,12 +368,12 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
|
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
|
||||||
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
|
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
|
punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||||
punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
|
punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
|
||||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
|
punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||||
punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
|
punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
|
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
|
||||||
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
|
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
|
||||||
@@ -384,82 +384,82 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
|
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
|
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
|
punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
|
||||||
punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
|
punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
|
||||||
movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
|
punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
|
||||||
punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
|
punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
|
||||||
|
|
||||||
movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
|
punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
|
||||||
punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
|
punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
|
punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
|
||||||
punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
|
punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
|
||||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
|
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
|
||||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
|
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
|
||||||
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
|
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
|
||||||
|
|
||||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
|
punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
|
||||||
punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
|
punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
|
||||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
|
punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
|
||||||
punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
|
punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
|
||||||
|
|
||||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
|
punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
|
||||||
punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
|
punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
|
||||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
|
punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
|
||||||
punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
|
punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
|
psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
|
||||||
psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||||
paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
|
paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
|
||||||
paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
|
paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
|
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
|
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
|
||||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
|
punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
|
||||||
punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
|
punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
|
||||||
movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
|
punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
|
||||||
punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
|
punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
movdqa xmm3,xmm4
|
movdqa xmm3, xmm4
|
||||||
paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
|
paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
|
||||||
paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||||
psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
|
psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
|
||||||
psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm1,xmm7
|
movdqa xmm1, xmm7
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
paddw xmm7,xmm5 ; xmm7=tmp10
|
paddw xmm7, xmm5 ; xmm7=tmp10
|
||||||
paddw xmm2,xmm4 ; xmm2=tmp11
|
paddw xmm2, xmm4 ; xmm2=tmp11
|
||||||
psubw xmm1,xmm5 ; xmm1=tmp13
|
psubw xmm1, xmm5 ; xmm1=tmp13
|
||||||
psubw xmm6,xmm4 ; xmm6=tmp12
|
psubw xmm6, xmm4 ; xmm6=tmp12
|
||||||
|
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
|
paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
|
||||||
psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
|
psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
|
||||||
|
|
||||||
paddw xmm7,[rel PW_DESCALE_P2X]
|
paddw xmm7, [rel PW_DESCALE_P2X]
|
||||||
paddw xmm5,[rel PW_DESCALE_P2X]
|
paddw xmm5, [rel PW_DESCALE_P2X]
|
||||||
psraw xmm7,PASS1_BITS ; xmm7=data0
|
psraw xmm7, PASS1_BITS ; xmm7=data0
|
||||||
psraw xmm5,PASS1_BITS ; xmm5=data4
|
psraw xmm5, PASS1_BITS ; xmm5=data4
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
|
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
|
||||||
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
|
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
|
||||||
@@ -473,28 +473,28 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; xmm1=tmp13
|
movdqa xmm4, xmm1 ; xmm1=tmp13
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
punpcklwd xmm4,xmm6 ; xmm6=tmp12
|
punpcklwd xmm4, xmm6 ; xmm6=tmp12
|
||||||
punpckhwd xmm2,xmm6
|
punpckhwd xmm2, xmm6
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L
|
pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
|
||||||
pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H
|
pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
|
||||||
pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L
|
pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
|
||||||
pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H
|
pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
|
||||||
|
|
||||||
paddd xmm4,[rel PD_DESCALE_P2]
|
paddd xmm4, [rel PD_DESCALE_P2]
|
||||||
paddd xmm2,[rel PD_DESCALE_P2]
|
paddd xmm2, [rel PD_DESCALE_P2]
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm2,DESCALE_P2
|
psrad xmm2, DESCALE_P2
|
||||||
paddd xmm1,[rel PD_DESCALE_P2]
|
paddd xmm1, [rel PD_DESCALE_P2]
|
||||||
paddd xmm6,[rel PD_DESCALE_P2]
|
paddd xmm6, [rel PD_DESCALE_P2]
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
psrad xmm6,DESCALE_P2
|
psrad xmm6, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm4,xmm2 ; xmm4=data2
|
packssdw xmm4, xmm2 ; xmm4=data2
|
||||||
packssdw xmm1,xmm6 ; xmm1=data6
|
packssdw xmm1, xmm6 ; xmm1=data6
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
|
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
|
||||||
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
|
||||||
@@ -504,10 +504,10 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||||
|
|
||||||
movdqa xmm2,xmm0 ; xmm0=tmp4
|
movdqa xmm2, xmm0 ; xmm0=tmp4
|
||||||
movdqa xmm6,xmm3 ; xmm3=tmp5
|
movdqa xmm6, xmm3 ; xmm3=tmp5
|
||||||
paddw xmm2,xmm7 ; xmm2=z3
|
paddw xmm2, xmm7 ; xmm2=z3
|
||||||
paddw xmm6,xmm5 ; xmm6=z4
|
paddw xmm6, xmm5 ; xmm6=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -518,16 +518,16 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm1,xmm2
|
movdqa xmm1, xmm2
|
||||||
punpcklwd xmm4,xmm6
|
punpcklwd xmm4, xmm6
|
||||||
punpckhwd xmm1,xmm6
|
punpckhwd xmm1, xmm6
|
||||||
movdqa xmm2,xmm4
|
movdqa xmm2, xmm4
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L
|
pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
|
||||||
pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H
|
pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
|
||||||
pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L
|
pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
|
||||||
pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H
|
pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
|
||||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
|
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
|
||||||
@@ -548,70 +548,70 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklwd xmm4,xmm5
|
punpcklwd xmm4, xmm5
|
||||||
punpckhwd xmm1,xmm5
|
punpckhwd xmm1, xmm5
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L
|
pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
|
||||||
pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H
|
pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
|
||||||
pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L
|
pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
|
||||||
pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H
|
pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
|
||||||
|
|
||||||
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
|
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
|
||||||
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
|
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
|
||||||
paddd xmm0,xmm2 ; xmm0=data1L
|
paddd xmm0, xmm2 ; xmm0=data1L
|
||||||
paddd xmm5,xmm6 ; xmm5=data1H
|
paddd xmm5, xmm6 ; xmm5=data1H
|
||||||
|
|
||||||
paddd xmm4,[rel PD_DESCALE_P2]
|
paddd xmm4, [rel PD_DESCALE_P2]
|
||||||
paddd xmm1,[rel PD_DESCALE_P2]
|
paddd xmm1, [rel PD_DESCALE_P2]
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
paddd xmm0,[rel PD_DESCALE_P2]
|
paddd xmm0, [rel PD_DESCALE_P2]
|
||||||
paddd xmm5,[rel PD_DESCALE_P2]
|
paddd xmm5, [rel PD_DESCALE_P2]
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
psrad xmm5,DESCALE_P2
|
psrad xmm5, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm4,xmm1 ; xmm4=data7
|
packssdw xmm4, xmm1 ; xmm4=data7
|
||||||
packssdw xmm0,xmm5 ; xmm0=data1
|
packssdw xmm0, xmm5 ; xmm0=data1
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
|
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
|
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
|
||||||
|
|
||||||
movdqa xmm1,xmm3
|
movdqa xmm1, xmm3
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
punpcklwd xmm1,xmm7
|
punpcklwd xmm1, xmm7
|
||||||
punpckhwd xmm5,xmm7
|
punpckhwd xmm5, xmm7
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
movdqa xmm7,xmm5
|
movdqa xmm7, xmm5
|
||||||
pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L
|
pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
|
||||||
pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H
|
pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
|
||||||
pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L
|
pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
|
||||||
pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H
|
pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
|
||||||
|
|
||||||
paddd xmm1,xmm2 ; xmm1=data5L
|
paddd xmm1, xmm2 ; xmm1=data5L
|
||||||
paddd xmm5,xmm6 ; xmm5=data5H
|
paddd xmm5, xmm6 ; xmm5=data5H
|
||||||
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
|
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
|
||||||
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
|
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
|
||||||
|
|
||||||
paddd xmm1,[rel PD_DESCALE_P2]
|
paddd xmm1, [rel PD_DESCALE_P2]
|
||||||
paddd xmm5,[rel PD_DESCALE_P2]
|
paddd xmm5, [rel PD_DESCALE_P2]
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
psrad xmm5,DESCALE_P2
|
psrad xmm5, DESCALE_P2
|
||||||
paddd xmm3,[rel PD_DESCALE_P2]
|
paddd xmm3, [rel PD_DESCALE_P2]
|
||||||
paddd xmm7,[rel PD_DESCALE_P2]
|
paddd xmm7, [rel PD_DESCALE_P2]
|
||||||
psrad xmm3,DESCALE_P2
|
psrad xmm3, DESCALE_P2
|
||||||
psrad xmm7,DESCALE_P2
|
psrad xmm7, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm1,xmm5 ; xmm1=data5
|
packssdw xmm1, xmm5 ; xmm1=data5
|
||||||
packssdw xmm3,xmm7 ; xmm3=data3
|
packssdw xmm3, xmm7 ; xmm3=data3
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
|
||||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -47,18 +47,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
|
||||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
|
||||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
|
||||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
|
||||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
|
||||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
|
||||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
|
||||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
|
||||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
|
||||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -104,11 +104,11 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
|
|||||||
|
|
||||||
EXTN(jsimd_fdct_islow_sse2):
|
EXTN(jsimd_fdct_islow_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
@@ -130,12 +130,12 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||||
@@ -148,80 +148,80 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||||
movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
|
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
|
||||||
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
|
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
|
||||||
|
|
||||||
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||||
punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||||
punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||||
|
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
movdqa xmm3,xmm0
|
movdqa xmm3, xmm0
|
||||||
psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
|
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||||
psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
|
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||||
paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
|
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||||
paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
|
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
|
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
|
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||||
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||||
movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||||
punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||||
|
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
|
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||||
paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
|
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||||
psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
|
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||||
psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
|
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
paddw xmm3,xmm1 ; xmm3=tmp10
|
paddw xmm3, xmm1 ; xmm3=tmp10
|
||||||
paddw xmm6,xmm7 ; xmm6=tmp11
|
paddw xmm6, xmm7 ; xmm6=tmp11
|
||||||
psubw xmm4,xmm1 ; xmm4=tmp13
|
psubw xmm4, xmm1 ; xmm4=tmp13
|
||||||
psubw xmm0,xmm7 ; xmm0=tmp12
|
psubw xmm0, xmm7 ; xmm0=tmp12
|
||||||
|
|
||||||
movdqa xmm1,xmm3
|
movdqa xmm1, xmm3
|
||||||
paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
|
paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
|
||||||
psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
|
psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
|
||||||
|
|
||||||
psllw xmm3,PASS1_BITS ; xmm3=data0
|
psllw xmm3, PASS1_BITS ; xmm3=data0
|
||||||
psllw xmm1,PASS1_BITS ; xmm1=data4
|
psllw xmm1, PASS1_BITS ; xmm1=data4
|
||||||
|
|
||||||
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
|
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
|
||||||
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
|
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
|
||||||
@@ -235,28 +235,28 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||||
|
|
||||||
movdqa xmm7,xmm4 ; xmm4=tmp13
|
movdqa xmm7, xmm4 ; xmm4=tmp13
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
punpcklwd xmm7,xmm0 ; xmm0=tmp12
|
punpcklwd xmm7, xmm0 ; xmm0=tmp12
|
||||||
punpckhwd xmm6,xmm0
|
punpckhwd xmm6, xmm0
|
||||||
movdqa xmm4,xmm7
|
movdqa xmm4, xmm7
|
||||||
movdqa xmm0,xmm6
|
movdqa xmm0, xmm6
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
|
pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
|
pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
|
pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
|
pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
|
||||||
|
|
||||||
paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
psrad xmm6,DESCALE_P1
|
psrad xmm6, DESCALE_P1
|
||||||
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm7,xmm6 ; xmm7=data2
|
packssdw xmm7, xmm6 ; xmm7=data2
|
||||||
packssdw xmm4,xmm0 ; xmm4=data6
|
packssdw xmm4, xmm0 ; xmm4=data6
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
|
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
|
||||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
|
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
|
||||||
@@ -266,10 +266,10 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
|
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
|
||||||
|
|
||||||
movdqa xmm6,xmm2 ; xmm2=tmp4
|
movdqa xmm6, xmm2 ; xmm2=tmp4
|
||||||
movdqa xmm0,xmm5 ; xmm5=tmp5
|
movdqa xmm0, xmm5 ; xmm5=tmp5
|
||||||
paddw xmm6,xmm3 ; xmm6=z3
|
paddw xmm6, xmm3 ; xmm6=z3
|
||||||
paddw xmm0,xmm1 ; xmm0=z4
|
paddw xmm0, xmm1 ; xmm0=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -280,16 +280,16 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
punpcklwd xmm7,xmm0
|
punpcklwd xmm7, xmm0
|
||||||
punpckhwd xmm4,xmm0
|
punpckhwd xmm4, xmm0
|
||||||
movdqa xmm6,xmm7
|
movdqa xmm6, xmm7
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
|
pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
|
pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
|
pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
|
||||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
|
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
|
||||||
@@ -310,61 +310,61 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||||
|
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
punpcklwd xmm7,xmm1
|
punpcklwd xmm7, xmm1
|
||||||
punpckhwd xmm4,xmm1
|
punpckhwd xmm4, xmm1
|
||||||
movdqa xmm2,xmm7
|
movdqa xmm2, xmm7
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
|
pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
|
pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
|
||||||
|
|
||||||
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
|
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
|
||||||
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
|
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
|
||||||
paddd xmm2,xmm6 ; xmm2=data1L
|
paddd xmm2, xmm6 ; xmm2=data1L
|
||||||
paddd xmm1,xmm0 ; xmm1=data1H
|
paddd xmm1, xmm0 ; xmm1=data1H
|
||||||
|
|
||||||
paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
psrad xmm2,DESCALE_P1
|
psrad xmm2, DESCALE_P1
|
||||||
psrad xmm1,DESCALE_P1
|
psrad xmm1, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm7,xmm4 ; xmm7=data7
|
packssdw xmm7, xmm4 ; xmm7=data7
|
||||||
packssdw xmm2,xmm1 ; xmm2=data1
|
packssdw xmm2, xmm1 ; xmm2=data1
|
||||||
|
|
||||||
movdqa xmm4,xmm5
|
movdqa xmm4, xmm5
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
punpcklwd xmm4,xmm3
|
punpcklwd xmm4, xmm3
|
||||||
punpckhwd xmm1,xmm3
|
punpckhwd xmm1, xmm3
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
|
pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
|
pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
|
pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
|
||||||
|
|
||||||
paddd xmm4,xmm6 ; xmm4=data5L
|
paddd xmm4, xmm6 ; xmm4=data5L
|
||||||
paddd xmm1,xmm0 ; xmm1=data5H
|
paddd xmm1, xmm0 ; xmm1=data5H
|
||||||
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
|
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
|
||||||
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
|
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
|
||||||
|
|
||||||
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
psrad xmm1,DESCALE_P1
|
psrad xmm1, DESCALE_P1
|
||||||
paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
|
paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||||
psrad xmm5,DESCALE_P1
|
psrad xmm5, DESCALE_P1
|
||||||
psrad xmm3,DESCALE_P1
|
psrad xmm3, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm4,xmm1 ; xmm4=data5
|
packssdw xmm4, xmm1 ; xmm4=data5
|
||||||
packssdw xmm5,xmm3 ; xmm5=data3
|
packssdw xmm5, xmm3 ; xmm5=data3
|
||||||
|
|
||||||
; ---- Pass 2: process columns.
|
; ---- Pass 2: process columns.
|
||||||
|
|
||||||
@@ -376,12 +376,12 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
|
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
|
||||||
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
|
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
|
punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||||
punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
|
punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
|
||||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
|
punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||||
punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
|
punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
|
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
|
||||||
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
|
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
|
||||||
@@ -392,82 +392,82 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
|
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
|
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||||
|
|
||||||
movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
|
punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
|
||||||
punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
|
punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
|
||||||
movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
|
punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
|
||||||
punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
|
punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
|
||||||
|
|
||||||
movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
|
punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
|
||||||
punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
|
punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
|
punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
|
||||||
punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
|
punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
|
||||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
|
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
|
||||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
|
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
|
||||||
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
|
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
|
||||||
|
|
||||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
|
punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
|
||||||
punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
|
punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
|
||||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
|
punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
|
||||||
punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
|
punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
|
||||||
|
|
||||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
|
punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
|
||||||
punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
|
punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
|
||||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
|
punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
|
||||||
punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
|
punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
|
psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
|
||||||
psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||||
paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
|
paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
|
||||||
paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
|
paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
|
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
|
||||||
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
|
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
|
||||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
|
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
|
||||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||||
|
|
||||||
movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
|
punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
|
||||||
punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
|
punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
|
||||||
movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
|
punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
|
||||||
punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
|
punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
movdqa xmm3,xmm4
|
movdqa xmm3, xmm4
|
||||||
paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
|
paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
|
||||||
paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||||
psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
|
psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
|
||||||
psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
movdqa xmm1,xmm7
|
movdqa xmm1, xmm7
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
paddw xmm7,xmm5 ; xmm7=tmp10
|
paddw xmm7, xmm5 ; xmm7=tmp10
|
||||||
paddw xmm2,xmm4 ; xmm2=tmp11
|
paddw xmm2, xmm4 ; xmm2=tmp11
|
||||||
psubw xmm1,xmm5 ; xmm1=tmp13
|
psubw xmm1, xmm5 ; xmm1=tmp13
|
||||||
psubw xmm6,xmm4 ; xmm6=tmp12
|
psubw xmm6, xmm4 ; xmm6=tmp12
|
||||||
|
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
|
paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
|
||||||
psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
|
psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
|
||||||
|
|
||||||
paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||||
paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||||
psraw xmm7,PASS1_BITS ; xmm7=data0
|
psraw xmm7, PASS1_BITS ; xmm7=data0
|
||||||
psraw xmm5,PASS1_BITS ; xmm5=data4
|
psraw xmm5, PASS1_BITS ; xmm5=data4
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
|
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
|
||||||
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
|
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
|
||||||
@@ -481,28 +481,28 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; xmm1=tmp13
|
movdqa xmm4, xmm1 ; xmm1=tmp13
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
punpcklwd xmm4,xmm6 ; xmm6=tmp12
|
punpcklwd xmm4, xmm6 ; xmm6=tmp12
|
||||||
punpckhwd xmm2,xmm6
|
punpckhwd xmm2, xmm6
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
|
pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
|
pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
|
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
|
pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
|
||||||
|
|
||||||
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm2,DESCALE_P2
|
psrad xmm2, DESCALE_P2
|
||||||
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
psrad xmm6,DESCALE_P2
|
psrad xmm6, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm4,xmm2 ; xmm4=data2
|
packssdw xmm4, xmm2 ; xmm4=data2
|
||||||
packssdw xmm1,xmm6 ; xmm1=data6
|
packssdw xmm1, xmm6 ; xmm1=data6
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
|
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
|
||||||
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
|
||||||
@@ -512,10 +512,10 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||||
|
|
||||||
movdqa xmm2,xmm0 ; xmm0=tmp4
|
movdqa xmm2, xmm0 ; xmm0=tmp4
|
||||||
movdqa xmm6,xmm3 ; xmm3=tmp5
|
movdqa xmm6, xmm3 ; xmm3=tmp5
|
||||||
paddw xmm2,xmm7 ; xmm2=z3
|
paddw xmm2, xmm7 ; xmm2=z3
|
||||||
paddw xmm6,xmm5 ; xmm6=z4
|
paddw xmm6, xmm5 ; xmm6=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -526,16 +526,16 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm1,xmm2
|
movdqa xmm1, xmm2
|
||||||
punpcklwd xmm4,xmm6
|
punpcklwd xmm4, xmm6
|
||||||
punpckhwd xmm1,xmm6
|
punpckhwd xmm1, xmm6
|
||||||
movdqa xmm2,xmm4
|
movdqa xmm2, xmm4
|
||||||
movdqa xmm6,xmm1
|
movdqa xmm6, xmm1
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
|
pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
|
pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
|
pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
|
||||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
|
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
|
||||||
@@ -556,64 +556,64 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklwd xmm4,xmm5
|
punpcklwd xmm4, xmm5
|
||||||
punpckhwd xmm1,xmm5
|
punpckhwd xmm1, xmm5
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
|
pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
|
pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
|
||||||
|
|
||||||
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
|
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
|
||||||
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
|
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
|
||||||
paddd xmm0,xmm2 ; xmm0=data1L
|
paddd xmm0, xmm2 ; xmm0=data1L
|
||||||
paddd xmm5,xmm6 ; xmm5=data1H
|
paddd xmm5, xmm6 ; xmm5=data1H
|
||||||
|
|
||||||
paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
psrad xmm5,DESCALE_P2
|
psrad xmm5, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm4,xmm1 ; xmm4=data7
|
packssdw xmm4, xmm1 ; xmm4=data7
|
||||||
packssdw xmm0,xmm5 ; xmm0=data1
|
packssdw xmm0, xmm5 ; xmm0=data1
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
|
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
|
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
|
||||||
|
|
||||||
movdqa xmm1,xmm3
|
movdqa xmm1, xmm3
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
punpcklwd xmm1,xmm7
|
punpcklwd xmm1, xmm7
|
||||||
punpckhwd xmm5,xmm7
|
punpckhwd xmm5, xmm7
|
||||||
movdqa xmm3,xmm1
|
movdqa xmm3, xmm1
|
||||||
movdqa xmm7,xmm5
|
movdqa xmm7, xmm5
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
|
pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
|
pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
|
||||||
|
|
||||||
paddd xmm1,xmm2 ; xmm1=data5L
|
paddd xmm1, xmm2 ; xmm1=data5L
|
||||||
paddd xmm5,xmm6 ; xmm5=data5H
|
paddd xmm5, xmm6 ; xmm5=data5H
|
||||||
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
|
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
|
||||||
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
|
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
|
||||||
|
|
||||||
paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
psrad xmm5,DESCALE_P2
|
psrad xmm5, DESCALE_P2
|
||||||
paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
|
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||||
psrad xmm3,DESCALE_P2
|
psrad xmm3, DESCALE_P2
|
||||||
psrad xmm7,DESCALE_P2
|
psrad xmm7, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm1,xmm5 ; xmm1=data5
|
packssdw xmm1, xmm5 ; xmm1=data5
|
||||||
packssdw xmm3,xmm7 ; xmm3=data3
|
packssdw xmm3, xmm7 ; xmm3=data3
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
|
||||||
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
|
||||||
@@ -623,7 +623,7 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; unused
|
; pop ecx ; unused
|
||||||
poppic ebx
|
poppic ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -26,11 +26,11 @@
|
|||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
|
|
||||||
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||||
shufps %1,%2,0x44
|
shufps %1, %2, 0x44
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||||
shufps %1,%2,0xEE
|
shufps %1, %2, 0xEE
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -77,11 +77,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_float_sse2):
|
EXTN(jsimd_idct_float_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [workspace]
|
lea rsp, [workspace]
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
@@ -105,35 +105,35 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||||
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||||
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm1,xmm2
|
por xmm1, xmm2
|
||||||
por xmm3,xmm4
|
por xmm3, xmm4
|
||||||
por xmm5,xmm6
|
por xmm5, xmm6
|
||||||
por xmm1,xmm3
|
por xmm1, xmm3
|
||||||
por xmm5,xmm7
|
por xmm5, xmm7
|
||||||
por xmm1,xmm5
|
por xmm1, xmm5
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
movd eax,xmm1
|
movd eax, xmm1
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
|
|
||||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||||
|
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||||
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
|
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||||
|
|
||||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
|
|
||||||
movaps xmm1,xmm0
|
movaps xmm1, xmm0
|
||||||
movaps xmm2,xmm0
|
movaps xmm2, xmm0
|
||||||
movaps xmm3,xmm0
|
movaps xmm3, xmm0
|
||||||
|
|
||||||
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
|
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
|
||||||
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
|
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
|
||||||
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
|
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
|
||||||
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
|
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
@@ -154,41 +154,41 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||||
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||||
|
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
||||||
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||||
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
||||||
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
|
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||||
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
|
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
|
||||||
|
|
||||||
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
||||||
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
||||||
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
||||||
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
||||||
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
|
cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
|
||||||
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
|
cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
|
||||||
|
|
||||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
|
|
||||||
movaps xmm4,xmm0
|
movaps xmm4, xmm0
|
||||||
movaps xmm5,xmm1
|
movaps xmm5, xmm1
|
||||||
subps xmm0,xmm2 ; xmm0=tmp11
|
subps xmm0, xmm2 ; xmm0=tmp11
|
||||||
subps xmm1,xmm3
|
subps xmm1, xmm3
|
||||||
addps xmm4,xmm2 ; xmm4=tmp10
|
addps xmm4, xmm2 ; xmm4=tmp10
|
||||||
addps xmm5,xmm3 ; xmm5=tmp13
|
addps xmm5, xmm3 ; xmm5=tmp13
|
||||||
|
|
||||||
mulps xmm1,[rel PD_1_414]
|
mulps xmm1, [rel PD_1_414]
|
||||||
subps xmm1,xmm5 ; xmm1=tmp12
|
subps xmm1, xmm5 ; xmm1=tmp12
|
||||||
|
|
||||||
movaps xmm6,xmm4
|
movaps xmm6, xmm4
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
subps xmm4,xmm5 ; xmm4=tmp3
|
subps xmm4, xmm5 ; xmm4=tmp3
|
||||||
subps xmm0,xmm1 ; xmm0=tmp2
|
subps xmm0, xmm1 ; xmm0=tmp2
|
||||||
addps xmm6,xmm5 ; xmm6=tmp0
|
addps xmm6, xmm5 ; xmm6=tmp0
|
||||||
addps xmm7,xmm1 ; xmm7=tmp1
|
addps xmm7, xmm1 ; xmm7=tmp1
|
||||||
|
|
||||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||||
@@ -200,63 +200,63 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||||
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||||
|
|
||||||
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
||||||
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
||||||
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
||||||
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
||||||
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
|
cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
|
||||||
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
|
cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
|
||||||
|
|
||||||
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
||||||
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
||||||
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
||||||
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
||||||
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
|
cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
|
||||||
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
|
cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
|
||||||
|
|
||||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
|
|
||||||
movaps xmm4,xmm2
|
movaps xmm4, xmm2
|
||||||
movaps xmm0,xmm5
|
movaps xmm0, xmm5
|
||||||
addps xmm2,xmm1 ; xmm2=z11
|
addps xmm2, xmm1 ; xmm2=z11
|
||||||
addps xmm5,xmm3 ; xmm5=z13
|
addps xmm5, xmm3 ; xmm5=z13
|
||||||
subps xmm4,xmm1 ; xmm4=z12
|
subps xmm4, xmm1 ; xmm4=z12
|
||||||
subps xmm0,xmm3 ; xmm0=z10
|
subps xmm0, xmm3 ; xmm0=z10
|
||||||
|
|
||||||
movaps xmm1,xmm2
|
movaps xmm1, xmm2
|
||||||
subps xmm2,xmm5
|
subps xmm2, xmm5
|
||||||
addps xmm1,xmm5 ; xmm1=tmp7
|
addps xmm1, xmm5 ; xmm1=tmp7
|
||||||
|
|
||||||
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
|
mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
|
||||||
|
|
||||||
movaps xmm3,xmm0
|
movaps xmm3, xmm0
|
||||||
addps xmm0,xmm4
|
addps xmm0, xmm4
|
||||||
mulps xmm0,[rel PD_1_847] ; xmm0=z5
|
mulps xmm0, [rel PD_1_847] ; xmm0=z5
|
||||||
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
||||||
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
|
mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
|
||||||
addps xmm3,xmm0 ; xmm3=tmp12
|
addps xmm3, xmm0 ; xmm3=tmp12
|
||||||
subps xmm4,xmm0 ; xmm4=tmp10
|
subps xmm4, xmm0 ; xmm4=tmp10
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
subps xmm3,xmm1 ; xmm3=tmp6
|
subps xmm3, xmm1 ; xmm3=tmp6
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
|
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
|
||||||
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
|
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
|
||||||
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
|
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
|
||||||
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
|
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
|
||||||
subps xmm2,xmm3 ; xmm2=tmp5
|
subps xmm2, xmm3 ; xmm2=tmp5
|
||||||
|
|
||||||
movaps xmm1,xmm6 ; transpose coefficients(phase 1)
|
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
|
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||||
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
|
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
|
||||||
movaps xmm3,xmm0 ; transpose coefficients(phase 1)
|
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
|
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
|
||||||
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
|
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
|
||||||
|
|
||||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||||
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
||||||
@@ -264,27 +264,27 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
||||||
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
||||||
|
|
||||||
addps xmm4,xmm2 ; xmm4=tmp4
|
addps xmm4, xmm2 ; xmm4=tmp4
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
movaps xmm3,xmm5
|
movaps xmm3, xmm5
|
||||||
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
|
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
|
||||||
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
|
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
|
||||||
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
|
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
|
||||||
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
|
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
|
||||||
|
|
||||||
movaps xmm2,xmm7 ; transpose coefficients(phase 1)
|
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
|
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
|
||||||
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
|
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
|
||||||
movaps xmm4,xmm5 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
|
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
|
||||||
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
|
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
|
||||||
|
|
||||||
movaps xmm3,xmm6 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
|
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
|
||||||
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
|
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
|
||||||
movaps xmm0,xmm1 ; transpose coefficients(phase 2)
|
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
|
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
|
||||||
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
|
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
|
||||||
|
|
||||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
||||||
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
||||||
@@ -294,12 +294,12 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||||
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
|
|
||||||
movaps xmm6,xmm5 ; transpose coefficients(phase 2)
|
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
|
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
|
||||||
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
|
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
|
||||||
movaps xmm3,xmm4 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
|
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
|
||||||
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
|
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
|
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
|
||||||
@@ -336,22 +336,22 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
|
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
|
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||||
|
|
||||||
movaps xmm4,xmm0
|
movaps xmm4, xmm0
|
||||||
movaps xmm5,xmm1
|
movaps xmm5, xmm1
|
||||||
subps xmm0,xmm2 ; xmm0=tmp11
|
subps xmm0, xmm2 ; xmm0=tmp11
|
||||||
subps xmm1,xmm3
|
subps xmm1, xmm3
|
||||||
addps xmm4,xmm2 ; xmm4=tmp10
|
addps xmm4, xmm2 ; xmm4=tmp10
|
||||||
addps xmm5,xmm3 ; xmm5=tmp13
|
addps xmm5, xmm3 ; xmm5=tmp13
|
||||||
|
|
||||||
mulps xmm1,[rel PD_1_414]
|
mulps xmm1, [rel PD_1_414]
|
||||||
subps xmm1,xmm5 ; xmm1=tmp12
|
subps xmm1, xmm5 ; xmm1=tmp12
|
||||||
|
|
||||||
movaps xmm6,xmm4
|
movaps xmm6, xmm4
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
subps xmm4,xmm5 ; xmm4=tmp3
|
subps xmm4, xmm5 ; xmm4=tmp3
|
||||||
subps xmm0,xmm1 ; xmm0=tmp2
|
subps xmm0, xmm1 ; xmm0=tmp2
|
||||||
addps xmm6,xmm5 ; xmm6=tmp0
|
addps xmm6, xmm5 ; xmm6=tmp0
|
||||||
addps xmm7,xmm1 ; xmm7=tmp1
|
addps xmm7, xmm1 ; xmm7=tmp1
|
||||||
|
|
||||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||||
@@ -363,98 +363,98 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
|
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
|
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||||
|
|
||||||
movaps xmm4,xmm2
|
movaps xmm4, xmm2
|
||||||
movaps xmm0,xmm5
|
movaps xmm0, xmm5
|
||||||
addps xmm2,xmm1 ; xmm2=z11
|
addps xmm2, xmm1 ; xmm2=z11
|
||||||
addps xmm5,xmm3 ; xmm5=z13
|
addps xmm5, xmm3 ; xmm5=z13
|
||||||
subps xmm4,xmm1 ; xmm4=z12
|
subps xmm4, xmm1 ; xmm4=z12
|
||||||
subps xmm0,xmm3 ; xmm0=z10
|
subps xmm0, xmm3 ; xmm0=z10
|
||||||
|
|
||||||
movaps xmm1,xmm2
|
movaps xmm1, xmm2
|
||||||
subps xmm2,xmm5
|
subps xmm2, xmm5
|
||||||
addps xmm1,xmm5 ; xmm1=tmp7
|
addps xmm1, xmm5 ; xmm1=tmp7
|
||||||
|
|
||||||
mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
|
mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
|
||||||
|
|
||||||
movaps xmm3,xmm0
|
movaps xmm3, xmm0
|
||||||
addps xmm0,xmm4
|
addps xmm0, xmm4
|
||||||
mulps xmm0,[rel PD_1_847] ; xmm0=z5
|
mulps xmm0, [rel PD_1_847] ; xmm0=z5
|
||||||
mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
||||||
mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
|
mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
|
||||||
addps xmm3,xmm0 ; xmm3=tmp12
|
addps xmm3, xmm0 ; xmm3=tmp12
|
||||||
subps xmm4,xmm0 ; xmm4=tmp10
|
subps xmm4, xmm0 ; xmm4=tmp10
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
subps xmm3,xmm1 ; xmm3=tmp6
|
subps xmm3, xmm1 ; xmm3=tmp6
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
|
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
|
||||||
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
|
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
|
||||||
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
|
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
|
||||||
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
|
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
|
||||||
subps xmm2,xmm3 ; xmm2=tmp5
|
subps xmm2, xmm3 ; xmm2=tmp5
|
||||||
|
|
||||||
movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
|
movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
|
||||||
pcmpeqd xmm3,xmm3
|
pcmpeqd xmm3, xmm3
|
||||||
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||||
|
|
||||||
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
||||||
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
||||||
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
||||||
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
||||||
|
|
||||||
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
||||||
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
||||||
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
||||||
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
||||||
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||||
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
||||||
|
|
||||||
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
||||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
||||||
|
|
||||||
addps xmm4,xmm2 ; xmm4=tmp4
|
addps xmm4, xmm2 ; xmm4=tmp4
|
||||||
movaps xmm7,xmm1
|
movaps xmm7, xmm1
|
||||||
movaps xmm5,xmm3
|
movaps xmm5, xmm3
|
||||||
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
|
addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
|
||||||
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
|
addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
|
||||||
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
|
subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
|
||||||
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
|
subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
|
||||||
|
|
||||||
movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
|
movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
|
||||||
pcmpeqd xmm4,xmm4
|
pcmpeqd xmm4, xmm4
|
||||||
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||||
|
|
||||||
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
||||||
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
||||||
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
||||||
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
||||||
|
|
||||||
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
||||||
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
||||||
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
||||||
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
||||||
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
||||||
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
||||||
|
|
||||||
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
|
movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
|
||||||
|
|
||||||
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
||||||
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
||||||
paddb xmm6,xmm2
|
paddb xmm6, xmm2
|
||||||
paddb xmm1,xmm2
|
paddb xmm1, xmm2
|
||||||
|
|
||||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||||
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||||
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||||
|
|
||||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||||
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||||
@@ -472,7 +472,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
|
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -25,11 +25,11 @@
|
|||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
|
|
||||||
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||||
shufps %1,%2,0x44
|
shufps %1, %2, 0x44
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||||
shufps %1,%2,0xEE
|
shufps %1, %2, 0xEE
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -76,11 +76,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_float_sse2):
|
EXTN(jsimd_idct_float_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [workspace]
|
lea esp, [workspace]
|
||||||
push ebx
|
push ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
@@ -97,7 +97,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||||
lea edi, [workspace] ; FAST_FLOAT *wsptr
|
lea edi, [workspace] ; FAST_FLOAT *wsptr
|
||||||
mov ecx, DCTSIZE/4 ; ctr
|
mov ecx, DCTSIZE/4 ; ctr
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.columnloop:
|
.columnloop:
|
||||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||||
@@ -111,35 +111,35 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||||
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||||
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm1,xmm2
|
por xmm1, xmm2
|
||||||
por xmm3,xmm4
|
por xmm3, xmm4
|
||||||
por xmm5,xmm6
|
por xmm5, xmm6
|
||||||
por xmm1,xmm3
|
por xmm1, xmm3
|
||||||
por xmm5,xmm7
|
por xmm5, xmm7
|
||||||
por xmm1,xmm5
|
por xmm1, xmm5
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
movd eax,xmm1
|
movd eax, xmm1
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
|
|
||||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||||
|
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||||
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
|
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||||
|
|
||||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
|
|
||||||
movaps xmm1,xmm0
|
movaps xmm1, xmm0
|
||||||
movaps xmm2,xmm0
|
movaps xmm2, xmm0
|
||||||
movaps xmm3,xmm0
|
movaps xmm3, xmm0
|
||||||
|
|
||||||
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
|
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
|
||||||
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
|
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
|
||||||
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
|
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
|
||||||
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
|
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
@@ -150,7 +150,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||||
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
|
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||||
jmp near .nextcolumn
|
jmp near .nextcolumn
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
%endif
|
%endif
|
||||||
.columnDCT:
|
.columnDCT:
|
||||||
|
|
||||||
@@ -161,41 +161,41 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||||
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||||
|
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
||||||
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||||
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
||||||
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
|
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||||
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
|
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
|
||||||
|
|
||||||
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
||||||
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
||||||
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
||||||
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
||||||
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
|
cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
|
||||||
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
|
cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
|
||||||
|
|
||||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
|
|
||||||
movaps xmm4,xmm0
|
movaps xmm4, xmm0
|
||||||
movaps xmm5,xmm1
|
movaps xmm5, xmm1
|
||||||
subps xmm0,xmm2 ; xmm0=tmp11
|
subps xmm0, xmm2 ; xmm0=tmp11
|
||||||
subps xmm1,xmm3
|
subps xmm1, xmm3
|
||||||
addps xmm4,xmm2 ; xmm4=tmp10
|
addps xmm4, xmm2 ; xmm4=tmp10
|
||||||
addps xmm5,xmm3 ; xmm5=tmp13
|
addps xmm5, xmm3 ; xmm5=tmp13
|
||||||
|
|
||||||
mulps xmm1,[GOTOFF(ebx,PD_1_414)]
|
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
|
||||||
subps xmm1,xmm5 ; xmm1=tmp12
|
subps xmm1, xmm5 ; xmm1=tmp12
|
||||||
|
|
||||||
movaps xmm6,xmm4
|
movaps xmm6, xmm4
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
subps xmm4,xmm5 ; xmm4=tmp3
|
subps xmm4, xmm5 ; xmm4=tmp3
|
||||||
subps xmm0,xmm1 ; xmm0=tmp2
|
subps xmm0, xmm1 ; xmm0=tmp2
|
||||||
addps xmm6,xmm5 ; xmm6=tmp0
|
addps xmm6, xmm5 ; xmm6=tmp0
|
||||||
addps xmm7,xmm1 ; xmm7=tmp1
|
addps xmm7, xmm1 ; xmm7=tmp1
|
||||||
|
|
||||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||||
@@ -207,63 +207,63 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||||
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||||
|
|
||||||
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
||||||
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
||||||
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
||||||
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
||||||
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
|
cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
|
||||||
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
|
cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
|
||||||
|
|
||||||
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
||||||
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
||||||
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
||||||
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
||||||
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
|
cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
|
||||||
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
|
cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
|
||||||
|
|
||||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||||
|
|
||||||
movaps xmm4,xmm2
|
movaps xmm4, xmm2
|
||||||
movaps xmm0,xmm5
|
movaps xmm0, xmm5
|
||||||
addps xmm2,xmm1 ; xmm2=z11
|
addps xmm2, xmm1 ; xmm2=z11
|
||||||
addps xmm5,xmm3 ; xmm5=z13
|
addps xmm5, xmm3 ; xmm5=z13
|
||||||
subps xmm4,xmm1 ; xmm4=z12
|
subps xmm4, xmm1 ; xmm4=z12
|
||||||
subps xmm0,xmm3 ; xmm0=z10
|
subps xmm0, xmm3 ; xmm0=z10
|
||||||
|
|
||||||
movaps xmm1,xmm2
|
movaps xmm1, xmm2
|
||||||
subps xmm2,xmm5
|
subps xmm2, xmm5
|
||||||
addps xmm1,xmm5 ; xmm1=tmp7
|
addps xmm1, xmm5 ; xmm1=tmp7
|
||||||
|
|
||||||
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
||||||
|
|
||||||
movaps xmm3,xmm0
|
movaps xmm3, xmm0
|
||||||
addps xmm0,xmm4
|
addps xmm0, xmm4
|
||||||
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
||||||
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
||||||
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
||||||
addps xmm3,xmm0 ; xmm3=tmp12
|
addps xmm3, xmm0 ; xmm3=tmp12
|
||||||
subps xmm4,xmm0 ; xmm4=tmp10
|
subps xmm4, xmm0 ; xmm4=tmp10
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
subps xmm3,xmm1 ; xmm3=tmp6
|
subps xmm3, xmm1 ; xmm3=tmp6
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
|
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
|
||||||
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
|
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
|
||||||
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
|
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
|
||||||
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
|
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
|
||||||
subps xmm2,xmm3 ; xmm2=tmp5
|
subps xmm2, xmm3 ; xmm2=tmp5
|
||||||
|
|
||||||
movaps xmm1,xmm6 ; transpose coefficients(phase 1)
|
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
|
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||||
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
|
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
|
||||||
movaps xmm3,xmm0 ; transpose coefficients(phase 1)
|
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
|
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
|
||||||
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
|
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
|
||||||
|
|
||||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||||
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
||||||
@@ -271,27 +271,27 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
||||||
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
||||||
|
|
||||||
addps xmm4,xmm2 ; xmm4=tmp4
|
addps xmm4, xmm2 ; xmm4=tmp4
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
movaps xmm3,xmm5
|
movaps xmm3, xmm5
|
||||||
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
|
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
|
||||||
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
|
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
|
||||||
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
|
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
|
||||||
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
|
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
|
||||||
|
|
||||||
movaps xmm2,xmm7 ; transpose coefficients(phase 1)
|
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
|
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
|
||||||
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
|
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
|
||||||
movaps xmm4,xmm5 ; transpose coefficients(phase 1)
|
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
|
||||||
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
|
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
|
||||||
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
|
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
|
||||||
|
|
||||||
movaps xmm3,xmm6 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
|
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
|
||||||
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
|
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
|
||||||
movaps xmm0,xmm1 ; transpose coefficients(phase 2)
|
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
|
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
|
||||||
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
|
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
|
||||||
|
|
||||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
||||||
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
||||||
@@ -301,12 +301,12 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
|
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||||
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
|
|
||||||
movaps xmm6,xmm5 ; transpose coefficients(phase 2)
|
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
|
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
|
||||||
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
|
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
|
||||||
movaps xmm3,xmm4 ; transpose coefficients(phase 2)
|
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
|
||||||
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
|
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
|
||||||
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
|
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
|
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
|
||||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
|
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
|
||||||
@@ -334,7 +334,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||||
mov eax, JDIMENSION [output_col(eax)]
|
mov eax, JDIMENSION [output_col(eax)]
|
||||||
mov ecx, DCTSIZE/4 ; ctr
|
mov ecx, DCTSIZE/4 ; ctr
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.rowloop:
|
.rowloop:
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
@@ -344,22 +344,22 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
|
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
|
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
|
||||||
|
|
||||||
movaps xmm4,xmm0
|
movaps xmm4, xmm0
|
||||||
movaps xmm5,xmm1
|
movaps xmm5, xmm1
|
||||||
subps xmm0,xmm2 ; xmm0=tmp11
|
subps xmm0, xmm2 ; xmm0=tmp11
|
||||||
subps xmm1,xmm3
|
subps xmm1, xmm3
|
||||||
addps xmm4,xmm2 ; xmm4=tmp10
|
addps xmm4, xmm2 ; xmm4=tmp10
|
||||||
addps xmm5,xmm3 ; xmm5=tmp13
|
addps xmm5, xmm3 ; xmm5=tmp13
|
||||||
|
|
||||||
mulps xmm1,[GOTOFF(ebx,PD_1_414)]
|
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
|
||||||
subps xmm1,xmm5 ; xmm1=tmp12
|
subps xmm1, xmm5 ; xmm1=tmp12
|
||||||
|
|
||||||
movaps xmm6,xmm4
|
movaps xmm6, xmm4
|
||||||
movaps xmm7,xmm0
|
movaps xmm7, xmm0
|
||||||
subps xmm4,xmm5 ; xmm4=tmp3
|
subps xmm4, xmm5 ; xmm4=tmp3
|
||||||
subps xmm0,xmm1 ; xmm0=tmp2
|
subps xmm0, xmm1 ; xmm0=tmp2
|
||||||
addps xmm6,xmm5 ; xmm6=tmp0
|
addps xmm6, xmm5 ; xmm6=tmp0
|
||||||
addps xmm7,xmm1 ; xmm7=tmp1
|
addps xmm7, xmm1 ; xmm7=tmp1
|
||||||
|
|
||||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||||
@@ -371,98 +371,98 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
|
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
|
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
|
||||||
|
|
||||||
movaps xmm4,xmm2
|
movaps xmm4, xmm2
|
||||||
movaps xmm0,xmm5
|
movaps xmm0, xmm5
|
||||||
addps xmm2,xmm1 ; xmm2=z11
|
addps xmm2, xmm1 ; xmm2=z11
|
||||||
addps xmm5,xmm3 ; xmm5=z13
|
addps xmm5, xmm3 ; xmm5=z13
|
||||||
subps xmm4,xmm1 ; xmm4=z12
|
subps xmm4, xmm1 ; xmm4=z12
|
||||||
subps xmm0,xmm3 ; xmm0=z10
|
subps xmm0, xmm3 ; xmm0=z10
|
||||||
|
|
||||||
movaps xmm1,xmm2
|
movaps xmm1, xmm2
|
||||||
subps xmm2,xmm5
|
subps xmm2, xmm5
|
||||||
addps xmm1,xmm5 ; xmm1=tmp7
|
addps xmm1, xmm5 ; xmm1=tmp7
|
||||||
|
|
||||||
mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
||||||
|
|
||||||
movaps xmm3,xmm0
|
movaps xmm3, xmm0
|
||||||
addps xmm0,xmm4
|
addps xmm0, xmm4
|
||||||
mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
||||||
mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
||||||
mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
||||||
addps xmm3,xmm0 ; xmm3=tmp12
|
addps xmm3, xmm0 ; xmm3=tmp12
|
||||||
subps xmm4,xmm0 ; xmm4=tmp10
|
subps xmm4, xmm0 ; xmm4=tmp10
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
subps xmm3,xmm1 ; xmm3=tmp6
|
subps xmm3, xmm1 ; xmm3=tmp6
|
||||||
movaps xmm5,xmm6
|
movaps xmm5, xmm6
|
||||||
movaps xmm0,xmm7
|
movaps xmm0, xmm7
|
||||||
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
|
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
|
||||||
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
|
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
|
||||||
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
|
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
|
||||||
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
|
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
|
||||||
subps xmm2,xmm3 ; xmm2=tmp5
|
subps xmm2, xmm3 ; xmm2=tmp5
|
||||||
|
|
||||||
movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
|
movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
|
||||||
pcmpeqd xmm3,xmm3
|
pcmpeqd xmm3, xmm3
|
||||||
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||||
|
|
||||||
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
||||||
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
||||||
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
||||||
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
||||||
|
|
||||||
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
||||||
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
||||||
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
||||||
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
||||||
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||||
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
||||||
|
|
||||||
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
||||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
||||||
|
|
||||||
addps xmm4,xmm2 ; xmm4=tmp4
|
addps xmm4, xmm2 ; xmm4=tmp4
|
||||||
movaps xmm7,xmm1
|
movaps xmm7, xmm1
|
||||||
movaps xmm5,xmm3
|
movaps xmm5, xmm3
|
||||||
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
|
addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
|
||||||
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
|
addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
|
||||||
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
|
subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
|
||||||
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
|
subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
|
||||||
|
|
||||||
movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
|
movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
|
||||||
pcmpeqd xmm4,xmm4
|
pcmpeqd xmm4, xmm4
|
||||||
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||||
|
|
||||||
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
||||||
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
||||||
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
||||||
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
||||||
|
|
||||||
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
||||||
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
||||||
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
||||||
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
||||||
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
||||||
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
|
movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
|
||||||
|
|
||||||
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
||||||
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
||||||
paddb xmm6,xmm2
|
paddb xmm6, xmm2
|
||||||
paddb xmm1,xmm2
|
paddb xmm1, xmm2
|
||||||
|
|
||||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||||
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||||
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||||
|
|
||||||
pushpic ebx ; save GOT address
|
pushpic ebx ; save GOT address
|
||||||
|
|
||||||
@@ -487,7 +487,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; need not be preserved
|
; pop ecx ; need not be preserved
|
||||||
pop ebx
|
pop ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -42,10 +42,10 @@ F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
|
F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
|
||||||
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
|
F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
|
F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
|
||||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
@@ -96,11 +96,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_ifast_sse2):
|
EXTN(jsimd_idct_ifast_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
@@ -121,11 +121,11 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm1,xmm0
|
por xmm1, xmm0
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
movd eax,xmm1
|
movd eax, xmm1
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
@@ -133,18 +133,18 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
|
punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
|
||||||
|
|
||||||
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
|
pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
|
||||||
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
|
pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
|
||||||
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
|
pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
|
||||||
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
|
pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
|
||||||
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
|
pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
|
||||||
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
|
pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
|
||||||
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
|
pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
|
||||||
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
|
pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
|
||||||
@@ -163,23 +163,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
psubw xmm0,xmm2 ; xmm0=tmp11
|
psubw xmm0, xmm2 ; xmm0=tmp11
|
||||||
psubw xmm1,xmm3
|
psubw xmm1, xmm3
|
||||||
paddw xmm4,xmm2 ; xmm4=tmp10
|
paddw xmm4, xmm2 ; xmm4=tmp10
|
||||||
paddw xmm5,xmm3 ; xmm5=tmp13
|
paddw xmm5, xmm3 ; xmm5=tmp13
|
||||||
|
|
||||||
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm1,[rel PW_F1414]
|
pmulhw xmm1, [rel PW_F1414]
|
||||||
psubw xmm1,xmm5 ; xmm1=tmp12
|
psubw xmm1, xmm5 ; xmm1=tmp12
|
||||||
|
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
movdqa xmm7,xmm0
|
movdqa xmm7, xmm0
|
||||||
psubw xmm4,xmm5 ; xmm4=tmp3
|
psubw xmm4, xmm5 ; xmm4=tmp3
|
||||||
psubw xmm0,xmm1 ; xmm0=tmp2
|
psubw xmm0, xmm1 ; xmm0=tmp2
|
||||||
paddw xmm6,xmm5 ; xmm6=tmp0
|
paddw xmm6, xmm5 ; xmm6=tmp0
|
||||||
paddw xmm7,xmm1 ; xmm7=tmp1
|
paddw xmm7, xmm1 ; xmm7=tmp1
|
||||||
|
|
||||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
|
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
|
||||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
|
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
|
||||||
@@ -195,23 +195,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
psubw xmm2,xmm1 ; xmm2=z12
|
psubw xmm2, xmm1 ; xmm2=z12
|
||||||
psubw xmm5,xmm3 ; xmm5=z10
|
psubw xmm5, xmm3 ; xmm5=z10
|
||||||
paddw xmm4,xmm1 ; xmm4=z11
|
paddw xmm4, xmm1 ; xmm4=z11
|
||||||
paddw xmm0,xmm3 ; xmm0=z13
|
paddw xmm0, xmm3 ; xmm0=z13
|
||||||
|
|
||||||
movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
|
movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
|
||||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
movdqa xmm3,xmm4
|
movdqa xmm3, xmm4
|
||||||
psubw xmm4,xmm0
|
psubw xmm4, xmm0
|
||||||
paddw xmm3,xmm0 ; xmm3=tmp7
|
paddw xmm3, xmm0 ; xmm3=tmp7
|
||||||
|
|
||||||
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11
|
pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
|
||||||
|
|
||||||
; To avoid overflow...
|
; To avoid overflow...
|
||||||
;
|
;
|
||||||
@@ -222,32 +222,32 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||||
; = -1.613125930 * z10 - z10 + z5;
|
; = -1.613125930 * z10 - z10 + z5;
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
paddw xmm5,xmm2
|
paddw xmm5, xmm2
|
||||||
pmulhw xmm5,[rel PW_F1847] ; xmm5=z5
|
pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
|
||||||
pmulhw xmm0,[rel PW_MF1613]
|
pmulhw xmm0, [rel PW_MF1613]
|
||||||
pmulhw xmm2,[rel PW_F1082]
|
pmulhw xmm2, [rel PW_F1082]
|
||||||
psubw xmm0,xmm1
|
psubw xmm0, xmm1
|
||||||
psubw xmm2,xmm5 ; xmm2=tmp10
|
psubw xmm2, xmm5 ; xmm2=tmp10
|
||||||
paddw xmm0,xmm5 ; xmm0=tmp12
|
paddw xmm0, xmm5 ; xmm0=tmp12
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
psubw xmm0,xmm3 ; xmm0=tmp6
|
psubw xmm0, xmm3 ; xmm0=tmp6
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
|
paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
|
||||||
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
|
paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
|
||||||
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
|
psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
|
||||||
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
|
psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
|
||||||
psubw xmm4,xmm0 ; xmm4=tmp5
|
psubw xmm4, xmm0 ; xmm4=tmp5
|
||||||
|
|
||||||
movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
||||||
@@ -255,27 +255,27 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
|
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
paddw xmm2,xmm4 ; xmm2=tmp4
|
paddw xmm2, xmm4 ; xmm2=tmp4
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
|
paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
|
||||||
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
|
paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
|
||||||
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||||
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
|
psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||||
movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||||
|
|
||||||
movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
|
punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
|
||||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
|
punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
|
||||||
|
|
||||||
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
|
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
|
||||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
|
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
|
||||||
@@ -283,19 +283,19 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
|
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
|
punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
|
punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
|
punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
|
||||||
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
|
punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
|
||||||
movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
|
punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
|
||||||
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
|
punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
|
||||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
|
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
|
||||||
@@ -303,12 +303,12 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
|
||||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
|
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
|
punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
|
||||||
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
|
punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
|
||||||
movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
|
punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
|
||||||
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
|
punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
|
||||||
.column_end:
|
.column_end:
|
||||||
|
|
||||||
; -- Prefetch the next coefficient block
|
; -- Prefetch the next coefficient block
|
||||||
@@ -328,23 +328,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
|
|
||||||
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
|
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
|
||||||
|
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
psubw xmm6,xmm1 ; xmm6=tmp11
|
psubw xmm6, xmm1 ; xmm6=tmp11
|
||||||
psubw xmm5,xmm3
|
psubw xmm5, xmm3
|
||||||
paddw xmm2,xmm1 ; xmm2=tmp10
|
paddw xmm2, xmm1 ; xmm2=tmp10
|
||||||
paddw xmm0,xmm3 ; xmm0=tmp13
|
paddw xmm0, xmm3 ; xmm0=tmp13
|
||||||
|
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm5,[rel PW_F1414]
|
pmulhw xmm5, [rel PW_F1414]
|
||||||
psubw xmm5,xmm0 ; xmm5=tmp12
|
psubw xmm5, xmm0 ; xmm5=tmp12
|
||||||
|
|
||||||
movdqa xmm1,xmm2
|
movdqa xmm1, xmm2
|
||||||
movdqa xmm3,xmm6
|
movdqa xmm3, xmm6
|
||||||
psubw xmm2,xmm0 ; xmm2=tmp3
|
psubw xmm2, xmm0 ; xmm2=tmp3
|
||||||
psubw xmm6,xmm5 ; xmm6=tmp2
|
psubw xmm6, xmm5 ; xmm6=tmp2
|
||||||
paddw xmm1,xmm0 ; xmm1=tmp0
|
paddw xmm1, xmm0 ; xmm1=tmp0
|
||||||
paddw xmm3,xmm5 ; xmm3=tmp1
|
paddw xmm3, xmm5 ; xmm3=tmp1
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
|
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
|
||||||
@@ -356,23 +356,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
|
|
||||||
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
|
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
|
||||||
|
|
||||||
movdqa xmm2,xmm0
|
movdqa xmm2, xmm0
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
psubw xmm0,xmm7 ; xmm0=z12
|
psubw xmm0, xmm7 ; xmm0=z12
|
||||||
psubw xmm4,xmm5 ; xmm4=z10
|
psubw xmm4, xmm5 ; xmm4=z10
|
||||||
paddw xmm2,xmm7 ; xmm2=z11
|
paddw xmm2, xmm7 ; xmm2=z11
|
||||||
paddw xmm6,xmm5 ; xmm6=z13
|
paddw xmm6, xmm5 ; xmm6=z13
|
||||||
|
|
||||||
movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
|
movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
|
||||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
movdqa xmm5,xmm2
|
movdqa xmm5, xmm2
|
||||||
psubw xmm2,xmm6
|
psubw xmm2, xmm6
|
||||||
paddw xmm5,xmm6 ; xmm5=tmp7
|
paddw xmm5, xmm6 ; xmm5=tmp7
|
||||||
|
|
||||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11
|
pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
|
||||||
|
|
||||||
; To avoid overflow...
|
; To avoid overflow...
|
||||||
;
|
;
|
||||||
@@ -383,83 +383,83 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||||
; = -1.613125930 * z10 - z10 + z5;
|
; = -1.613125930 * z10 - z10 + z5;
|
||||||
|
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
paddw xmm4,xmm0
|
paddw xmm4, xmm0
|
||||||
pmulhw xmm4,[rel PW_F1847] ; xmm4=z5
|
pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
|
||||||
pmulhw xmm6,[rel PW_MF1613]
|
pmulhw xmm6, [rel PW_MF1613]
|
||||||
pmulhw xmm0,[rel PW_F1082]
|
pmulhw xmm0, [rel PW_F1082]
|
||||||
psubw xmm6,xmm7
|
psubw xmm6, xmm7
|
||||||
psubw xmm0,xmm4 ; xmm0=tmp10
|
psubw xmm0, xmm4 ; xmm0=tmp10
|
||||||
paddw xmm6,xmm4 ; xmm6=tmp12
|
paddw xmm6, xmm4 ; xmm6=tmp12
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
psubw xmm6,xmm5 ; xmm6=tmp6
|
psubw xmm6, xmm5 ; xmm6=tmp6
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
|
paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
|
||||||
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||||
psraw xmm1,(PASS1_BITS+3) ; descale
|
psraw xmm1, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm3,(PASS1_BITS+3) ; descale
|
psraw xmm3, (PASS1_BITS+3) ; descale
|
||||||
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
|
psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
|
||||||
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||||
psraw xmm7,(PASS1_BITS+3) ; descale
|
psraw xmm7, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm4,(PASS1_BITS+3) ; descale
|
psraw xmm4, (PASS1_BITS+3) ; descale
|
||||||
psubw xmm2,xmm6 ; xmm2=tmp5
|
psubw xmm2, xmm6 ; xmm2=tmp5
|
||||||
|
|
||||||
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
|
||||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
|
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
|
||||||
|
|
||||||
paddw xmm0,xmm2 ; xmm0=tmp4
|
paddw xmm0, xmm2 ; xmm0=tmp4
|
||||||
movdqa xmm4,xmm5
|
movdqa xmm4, xmm5
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
|
paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
|
||||||
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
|
paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
|
||||||
psraw xmm5,(PASS1_BITS+3) ; descale
|
psraw xmm5, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm6,(PASS1_BITS+3) ; descale
|
psraw xmm6, (PASS1_BITS+3) ; descale
|
||||||
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||||
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
|
psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
|
||||||
psraw xmm4,(PASS1_BITS+3) ; descale
|
psraw xmm4, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm7,(PASS1_BITS+3) ; descale
|
psraw xmm7, (PASS1_BITS+3) ; descale
|
||||||
|
|
||||||
movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
|
movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
|
||||||
|
|
||||||
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||||
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||||
|
|
||||||
paddb xmm1,xmm2
|
paddb xmm1, xmm2
|
||||||
paddb xmm3,xmm2
|
paddb xmm3, xmm2
|
||||||
paddb xmm5,xmm2
|
paddb xmm5, xmm2
|
||||||
paddb xmm7,xmm2
|
paddb xmm7, xmm2
|
||||||
|
|
||||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||||
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||||
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||||
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||||
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||||
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||||
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||||
movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||||
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||||
|
|
||||||
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||||
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||||
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||||
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||||
|
|
||||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||||
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||||
@@ -480,7 +480,7 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -41,10 +41,10 @@ F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
|
F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
|
||||||
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
|
F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
|
F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
|
||||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
@@ -95,11 +95,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_ifast_sse2):
|
EXTN(jsimd_idct_ifast_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
@@ -127,11 +127,11 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm1,xmm0
|
por xmm1, xmm0
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
movd eax,xmm1
|
movd eax, xmm1
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
@@ -139,23 +139,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
|
punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
|
||||||
|
|
||||||
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
|
pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
|
||||||
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
|
pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
|
||||||
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
|
pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
|
||||||
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
|
pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
|
||||||
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
|
pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
|
||||||
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
|
pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
|
||||||
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
|
pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
|
||||||
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
|
pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
|
||||||
jmp near .column_end
|
jmp near .column_end
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
%endif
|
%endif
|
||||||
.columnDCT:
|
.columnDCT:
|
||||||
|
|
||||||
@@ -170,23 +170,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
psubw xmm0,xmm2 ; xmm0=tmp11
|
psubw xmm0, xmm2 ; xmm0=tmp11
|
||||||
psubw xmm1,xmm3
|
psubw xmm1, xmm3
|
||||||
paddw xmm4,xmm2 ; xmm4=tmp10
|
paddw xmm4, xmm2 ; xmm4=tmp10
|
||||||
paddw xmm5,xmm3 ; xmm5=tmp13
|
paddw xmm5, xmm3 ; xmm5=tmp13
|
||||||
|
|
||||||
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm1,[GOTOFF(ebx,PW_F1414)]
|
pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
|
||||||
psubw xmm1,xmm5 ; xmm1=tmp12
|
psubw xmm1, xmm5 ; xmm1=tmp12
|
||||||
|
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
movdqa xmm7,xmm0
|
movdqa xmm7, xmm0
|
||||||
psubw xmm4,xmm5 ; xmm4=tmp3
|
psubw xmm4, xmm5 ; xmm4=tmp3
|
||||||
psubw xmm0,xmm1 ; xmm0=tmp2
|
psubw xmm0, xmm1 ; xmm0=tmp2
|
||||||
paddw xmm6,xmm5 ; xmm6=tmp0
|
paddw xmm6, xmm5 ; xmm6=tmp0
|
||||||
paddw xmm7,xmm1 ; xmm7=tmp1
|
paddw xmm7, xmm1 ; xmm7=tmp1
|
||||||
|
|
||||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
|
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
|
||||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
|
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
|
||||||
@@ -202,23 +202,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
psubw xmm2,xmm1 ; xmm2=z12
|
psubw xmm2, xmm1 ; xmm2=z12
|
||||||
psubw xmm5,xmm3 ; xmm5=z10
|
psubw xmm5, xmm3 ; xmm5=z10
|
||||||
paddw xmm4,xmm1 ; xmm4=z11
|
paddw xmm4, xmm1 ; xmm4=z11
|
||||||
paddw xmm0,xmm3 ; xmm0=z13
|
paddw xmm0, xmm3 ; xmm0=z13
|
||||||
|
|
||||||
movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
|
movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
|
||||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
movdqa xmm3,xmm4
|
movdqa xmm3, xmm4
|
||||||
psubw xmm4,xmm0
|
psubw xmm4, xmm0
|
||||||
paddw xmm3,xmm0 ; xmm3=tmp7
|
paddw xmm3, xmm0 ; xmm3=tmp7
|
||||||
|
|
||||||
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
|
pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
|
||||||
|
|
||||||
; To avoid overflow...
|
; To avoid overflow...
|
||||||
;
|
;
|
||||||
@@ -229,32 +229,32 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||||
; = -1.613125930 * z10 - z10 + z5;
|
; = -1.613125930 * z10 - z10 + z5;
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
paddw xmm5,xmm2
|
paddw xmm5, xmm2
|
||||||
pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5
|
pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
|
||||||
pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)]
|
pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
|
||||||
pmulhw xmm2,[GOTOFF(ebx,PW_F1082)]
|
pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
|
||||||
psubw xmm0,xmm1
|
psubw xmm0, xmm1
|
||||||
psubw xmm2,xmm5 ; xmm2=tmp10
|
psubw xmm2, xmm5 ; xmm2=tmp10
|
||||||
paddw xmm0,xmm5 ; xmm0=tmp12
|
paddw xmm0, xmm5 ; xmm0=tmp12
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
psubw xmm0,xmm3 ; xmm0=tmp6
|
psubw xmm0, xmm3 ; xmm0=tmp6
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
|
paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
|
||||||
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
|
paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
|
||||||
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
|
psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
|
||||||
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
|
psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
|
||||||
psubw xmm4,xmm0 ; xmm4=tmp5
|
psubw xmm4, xmm0 ; xmm4=tmp5
|
||||||
|
|
||||||
movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
||||||
@@ -262,27 +262,27 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
|
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
paddw xmm2,xmm4 ; xmm2=tmp4
|
paddw xmm2, xmm4 ; xmm2=tmp4
|
||||||
movdqa xmm5,xmm7
|
movdqa xmm5, xmm7
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
|
paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
|
||||||
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
|
paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
|
||||||
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||||
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
|
psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||||
movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||||
|
|
||||||
movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
|
punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
|
||||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
|
punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
|
||||||
|
|
||||||
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
|
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
|
||||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
|
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
|
||||||
@@ -290,19 +290,19 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
|
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
|
punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
|
punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
|
punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
|
||||||
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
|
punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
|
||||||
movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
|
punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
|
||||||
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
|
punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
|
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
|
||||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
|
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
|
||||||
@@ -310,12 +310,12 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
|
||||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
|
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
|
punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
|
||||||
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
|
punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
|
||||||
movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
|
punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
|
||||||
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
|
punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
|
||||||
.column_end:
|
.column_end:
|
||||||
|
|
||||||
; -- Prefetch the next coefficient block
|
; -- Prefetch the next coefficient block
|
||||||
@@ -335,23 +335,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
|
|
||||||
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
|
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
|
||||||
|
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
psubw xmm6,xmm1 ; xmm6=tmp11
|
psubw xmm6, xmm1 ; xmm6=tmp11
|
||||||
psubw xmm5,xmm3
|
psubw xmm5, xmm3
|
||||||
paddw xmm2,xmm1 ; xmm2=tmp10
|
paddw xmm2, xmm1 ; xmm2=tmp10
|
||||||
paddw xmm0,xmm3 ; xmm0=tmp13
|
paddw xmm0, xmm3 ; xmm0=tmp13
|
||||||
|
|
||||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm5,[GOTOFF(ebx,PW_F1414)]
|
pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
|
||||||
psubw xmm5,xmm0 ; xmm5=tmp12
|
psubw xmm5, xmm0 ; xmm5=tmp12
|
||||||
|
|
||||||
movdqa xmm1,xmm2
|
movdqa xmm1, xmm2
|
||||||
movdqa xmm3,xmm6
|
movdqa xmm3, xmm6
|
||||||
psubw xmm2,xmm0 ; xmm2=tmp3
|
psubw xmm2, xmm0 ; xmm2=tmp3
|
||||||
psubw xmm6,xmm5 ; xmm6=tmp2
|
psubw xmm6, xmm5 ; xmm6=tmp2
|
||||||
paddw xmm1,xmm0 ; xmm1=tmp0
|
paddw xmm1, xmm0 ; xmm1=tmp0
|
||||||
paddw xmm3,xmm5 ; xmm3=tmp1
|
paddw xmm3, xmm5 ; xmm3=tmp1
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
|
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
|
||||||
@@ -363,23 +363,23 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
|
|
||||||
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
|
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
|
||||||
|
|
||||||
movdqa xmm2,xmm0
|
movdqa xmm2, xmm0
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
psubw xmm0,xmm7 ; xmm0=z12
|
psubw xmm0, xmm7 ; xmm0=z12
|
||||||
psubw xmm4,xmm5 ; xmm4=z10
|
psubw xmm4, xmm5 ; xmm4=z10
|
||||||
paddw xmm2,xmm7 ; xmm2=z11
|
paddw xmm2, xmm7 ; xmm2=z11
|
||||||
paddw xmm6,xmm5 ; xmm6=z13
|
paddw xmm6, xmm5 ; xmm6=z13
|
||||||
|
|
||||||
movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
|
movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
|
||||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||||
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
|
||||||
|
|
||||||
movdqa xmm5,xmm2
|
movdqa xmm5, xmm2
|
||||||
psubw xmm2,xmm6
|
psubw xmm2, xmm6
|
||||||
paddw xmm5,xmm6 ; xmm5=tmp7
|
paddw xmm5, xmm6 ; xmm5=tmp7
|
||||||
|
|
||||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||||
pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
|
pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
|
||||||
|
|
||||||
; To avoid overflow...
|
; To avoid overflow...
|
||||||
;
|
;
|
||||||
@@ -390,83 +390,83 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||||
; = -1.613125930 * z10 - z10 + z5;
|
; = -1.613125930 * z10 - z10 + z5;
|
||||||
|
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
paddw xmm4,xmm0
|
paddw xmm4, xmm0
|
||||||
pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5
|
pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
|
||||||
pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)]
|
pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
|
||||||
pmulhw xmm0,[GOTOFF(ebx,PW_F1082)]
|
pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
|
||||||
psubw xmm6,xmm7
|
psubw xmm6, xmm7
|
||||||
psubw xmm0,xmm4 ; xmm0=tmp10
|
psubw xmm0, xmm4 ; xmm0=tmp10
|
||||||
paddw xmm6,xmm4 ; xmm6=tmp12
|
paddw xmm6, xmm4 ; xmm6=tmp12
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
psubw xmm6,xmm5 ; xmm6=tmp6
|
psubw xmm6, xmm5 ; xmm6=tmp6
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
|
paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
|
||||||
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||||
psraw xmm1,(PASS1_BITS+3) ; descale
|
psraw xmm1, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm3,(PASS1_BITS+3) ; descale
|
psraw xmm3, (PASS1_BITS+3) ; descale
|
||||||
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
|
psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
|
||||||
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||||
psraw xmm7,(PASS1_BITS+3) ; descale
|
psraw xmm7, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm4,(PASS1_BITS+3) ; descale
|
psraw xmm4, (PASS1_BITS+3) ; descale
|
||||||
psubw xmm2,xmm6 ; xmm2=tmp5
|
psubw xmm2, xmm6 ; xmm2=tmp5
|
||||||
|
|
||||||
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
|
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
|
||||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
|
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
|
||||||
|
|
||||||
paddw xmm0,xmm2 ; xmm0=tmp4
|
paddw xmm0, xmm2 ; xmm0=tmp4
|
||||||
movdqa xmm4,xmm5
|
movdqa xmm4, xmm5
|
||||||
movdqa xmm7,xmm6
|
movdqa xmm7, xmm6
|
||||||
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
|
paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
|
||||||
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
|
paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
|
||||||
psraw xmm5,(PASS1_BITS+3) ; descale
|
psraw xmm5, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm6,(PASS1_BITS+3) ; descale
|
psraw xmm6, (PASS1_BITS+3) ; descale
|
||||||
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||||
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
|
psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
|
||||||
psraw xmm4,(PASS1_BITS+3) ; descale
|
psraw xmm4, (PASS1_BITS+3) ; descale
|
||||||
psraw xmm7,(PASS1_BITS+3) ; descale
|
psraw xmm7, (PASS1_BITS+3) ; descale
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
|
movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
|
||||||
|
|
||||||
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||||
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||||
|
|
||||||
paddb xmm1,xmm2
|
paddb xmm1, xmm2
|
||||||
paddb xmm3,xmm2
|
paddb xmm3, xmm2
|
||||||
paddb xmm5,xmm2
|
paddb xmm5, xmm2
|
||||||
paddb xmm7,xmm2
|
paddb xmm7, xmm2
|
||||||
|
|
||||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||||
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||||
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||||
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||||
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||||
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||||
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||||
movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||||
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||||
|
|
||||||
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||||
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||||
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||||
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||||
|
|
||||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||||
@@ -491,7 +491,7 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; unused
|
; pop ecx ; unused
|
||||||
poppic ebx
|
poppic ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -48,18 +48,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
|
||||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
|
||||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
|
||||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
|
||||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
|
||||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
|
||||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
|
||||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
|
||||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
|
||||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -109,11 +109,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_islow_sse2):
|
EXTN(jsimd_idct_islow_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
@@ -134,11 +134,11 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm1,xmm0
|
por xmm1, xmm0
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
movd eax,xmm1
|
movd eax, xmm1
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
@@ -146,20 +146,20 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||||
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
psllw xmm5,PASS1_BITS
|
psllw xmm5, PASS1_BITS
|
||||||
|
|
||||||
movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
||||||
punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
||||||
punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
||||||
|
|
||||||
pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
||||||
pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
||||||
pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
||||||
pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
||||||
pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
||||||
pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
||||||
pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
||||||
pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
||||||
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
||||||
@@ -189,53 +189,53 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; xmm1=in2=z2
|
movdqa xmm4, xmm1 ; xmm1=in2=z2
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklwd xmm4,xmm3 ; xmm3=in6=z3
|
punpcklwd xmm4, xmm3 ; xmm3=in6=z3
|
||||||
punpckhwd xmm5,xmm3
|
punpckhwd xmm5, xmm3
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
movdqa xmm3,xmm5
|
movdqa xmm3, xmm5
|
||||||
pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L
|
pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
|
||||||
pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
|
pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
|
||||||
pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
|
pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
|
||||||
pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H
|
pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
|
||||||
|
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
paddw xmm0,xmm2 ; xmm0=in0+in4
|
paddw xmm0, xmm2 ; xmm0=in0+in4
|
||||||
psubw xmm6,xmm2 ; xmm6=in0-in4
|
psubw xmm6, xmm2 ; xmm6=in0-in4
|
||||||
|
|
||||||
pxor xmm7,xmm7
|
pxor xmm7, xmm7
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
punpcklwd xmm7,xmm0 ; xmm7=tmp0L
|
punpcklwd xmm7, xmm0 ; xmm7=tmp0L
|
||||||
punpckhwd xmm2,xmm0 ; xmm2=tmp0H
|
punpckhwd xmm2, xmm0 ; xmm2=tmp0H
|
||||||
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||||
psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
paddd xmm7,xmm4 ; xmm7=tmp10L
|
paddd xmm7, xmm4 ; xmm7=tmp10L
|
||||||
psubd xmm0,xmm4 ; xmm0=tmp13L
|
psubd xmm0, xmm4 ; xmm0=tmp13L
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
paddd xmm2,xmm5 ; xmm2=tmp10H
|
paddd xmm2, xmm5 ; xmm2=tmp10H
|
||||||
psubd xmm4,xmm5 ; xmm4=tmp13H
|
psubd xmm4, xmm5 ; xmm4=tmp13H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
||||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
||||||
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
||||||
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
||||||
|
|
||||||
pxor xmm5,xmm5
|
pxor xmm5, xmm5
|
||||||
pxor xmm7,xmm7
|
pxor xmm7, xmm7
|
||||||
punpcklwd xmm5,xmm6 ; xmm5=tmp1L
|
punpcklwd xmm5, xmm6 ; xmm5=tmp1L
|
||||||
punpckhwd xmm7,xmm6 ; xmm7=tmp1H
|
punpckhwd xmm7, xmm6 ; xmm7=tmp1H
|
||||||
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||||
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
paddd xmm5,xmm1 ; xmm5=tmp11L
|
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||||
psubd xmm2,xmm1 ; xmm2=tmp12L
|
psubd xmm2, xmm1 ; xmm2=tmp12L
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
paddd xmm7,xmm3 ; xmm7=tmp11H
|
paddd xmm7, xmm3 ; xmm7=tmp11H
|
||||||
psubd xmm0,xmm3 ; xmm0=tmp12H
|
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||||
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
||||||
@@ -253,10 +253,10 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm5,xmm6
|
movdqa xmm5, xmm6
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddw xmm5,xmm3 ; xmm5=z3
|
paddw xmm5, xmm3 ; xmm5=z3
|
||||||
paddw xmm7,xmm1 ; xmm7=z4
|
paddw xmm7, xmm1 ; xmm7=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -267,16 +267,16 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
punpcklwd xmm2,xmm7
|
punpcklwd xmm2, xmm7
|
||||||
punpckhwd xmm0,xmm7
|
punpckhwd xmm0, xmm7
|
||||||
movdqa xmm5,xmm2
|
movdqa xmm5, xmm2
|
||||||
movdqa xmm7,xmm0
|
movdqa xmm7, xmm0
|
||||||
pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L
|
pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
|
||||||
pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H
|
pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
|
||||||
pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
|
pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
|
||||||
pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H
|
pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
||||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
||||||
@@ -297,38 +297,38 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp0 += z3; tmp1 += z4;
|
; tmp0 += z3; tmp1 += z4;
|
||||||
; tmp2 += z3; tmp3 += z4;
|
; tmp2 += z3; tmp3 += z4;
|
||||||
|
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
movdqa xmm0,xmm3
|
movdqa xmm0, xmm3
|
||||||
punpcklwd xmm2,xmm4
|
punpcklwd xmm2, xmm4
|
||||||
punpckhwd xmm0,xmm4
|
punpckhwd xmm0, xmm4
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L
|
pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
|
||||||
pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H
|
pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
|
||||||
pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L
|
pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
|
||||||
pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H
|
pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
|
||||||
|
|
||||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
||||||
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
||||||
paddd xmm3,xmm5 ; xmm3=tmp3L
|
paddd xmm3, xmm5 ; xmm3=tmp3L
|
||||||
paddd xmm4,xmm7 ; xmm4=tmp3H
|
paddd xmm4, xmm7 ; xmm4=tmp3H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
||||||
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
||||||
|
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
punpcklwd xmm2,xmm6
|
punpcklwd xmm2, xmm6
|
||||||
punpckhwd xmm0,xmm6
|
punpckhwd xmm0, xmm6
|
||||||
movdqa xmm1,xmm2
|
movdqa xmm1, xmm2
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L
|
pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
|
||||||
pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H
|
pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
|
||||||
pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L
|
pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
|
||||||
pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
|
pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
|
||||||
|
|
||||||
paddd xmm2,xmm5 ; xmm2=tmp1L
|
paddd xmm2, xmm5 ; xmm2=tmp1L
|
||||||
paddd xmm0,xmm7 ; xmm0=tmp1H
|
paddd xmm0, xmm7 ; xmm0=tmp1H
|
||||||
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
||||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||||
|
|
||||||
@@ -340,57 +340,57 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
paddd xmm5,xmm3 ; xmm5=data0L
|
paddd xmm5, xmm3 ; xmm5=data0L
|
||||||
paddd xmm7,xmm4 ; xmm7=data0H
|
paddd xmm7, xmm4 ; xmm7=data0H
|
||||||
psubd xmm2,xmm3 ; xmm2=data7L
|
psubd xmm2, xmm3 ; xmm2=data7L
|
||||||
psubd xmm0,xmm4 ; xmm0=data7H
|
psubd xmm0, xmm4 ; xmm0=data7H
|
||||||
|
|
||||||
movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
|
movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm5,xmm3
|
paddd xmm5, xmm3
|
||||||
paddd xmm7,xmm3
|
paddd xmm7, xmm3
|
||||||
psrad xmm5,DESCALE_P1
|
psrad xmm5, DESCALE_P1
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
paddd xmm2,xmm3
|
paddd xmm2, xmm3
|
||||||
paddd xmm0,xmm3
|
paddd xmm0, xmm3
|
||||||
psrad xmm2,DESCALE_P1
|
psrad xmm2, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
||||||
packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
||||||
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
||||||
|
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
movdqa xmm0,xmm3
|
movdqa xmm0, xmm3
|
||||||
paddd xmm4,xmm1 ; xmm4=data1L
|
paddd xmm4, xmm1 ; xmm4=data1L
|
||||||
paddd xmm3,xmm6 ; xmm3=data1H
|
paddd xmm3, xmm6 ; xmm3=data1H
|
||||||
psubd xmm7,xmm1 ; xmm7=data6L
|
psubd xmm7, xmm1 ; xmm7=data6L
|
||||||
psubd xmm0,xmm6 ; xmm0=data6H
|
psubd xmm0, xmm6 ; xmm0=data6H
|
||||||
|
|
||||||
movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
|
movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
paddd xmm3,xmm1
|
paddd xmm3, xmm1
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
psrad xmm3,DESCALE_P1
|
psrad xmm3, DESCALE_P1
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
paddd xmm0,xmm1
|
paddd xmm0, xmm1
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||||
packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
||||||
|
|
||||||
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
||||||
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
||||||
@@ -402,69 +402,69 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
||||||
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
paddd xmm3,xmm4 ; xmm3=data2L
|
paddd xmm3, xmm4 ; xmm3=data2L
|
||||||
paddd xmm0,xmm2 ; xmm0=data2H
|
paddd xmm0, xmm2 ; xmm0=data2H
|
||||||
psubd xmm5,xmm4 ; xmm5=data5L
|
psubd xmm5, xmm4 ; xmm5=data5L
|
||||||
psubd xmm6,xmm2 ; xmm6=data5H
|
psubd xmm6, xmm2 ; xmm6=data5H
|
||||||
|
|
||||||
movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
|
movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm3,xmm7
|
paddd xmm3, xmm7
|
||||||
paddd xmm0,xmm7
|
paddd xmm0, xmm7
|
||||||
psrad xmm3,DESCALE_P1
|
psrad xmm3, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
paddd xmm5,xmm7
|
paddd xmm5, xmm7
|
||||||
paddd xmm6,xmm7
|
paddd xmm6, xmm7
|
||||||
psrad xmm5,DESCALE_P1
|
psrad xmm5, DESCALE_P1
|
||||||
psrad xmm6,DESCALE_P1
|
psrad xmm6, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
||||||
packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
||||||
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
||||||
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
||||||
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
||||||
|
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
paddd xmm1,xmm2 ; xmm1=data3L
|
paddd xmm1, xmm2 ; xmm1=data3L
|
||||||
paddd xmm4,xmm7 ; xmm4=data3H
|
paddd xmm4, xmm7 ; xmm4=data3H
|
||||||
psubd xmm0,xmm2 ; xmm0=data4L
|
psubd xmm0, xmm2 ; xmm0=data4L
|
||||||
psubd xmm6,xmm7 ; xmm6=data4H
|
psubd xmm6, xmm7 ; xmm6=data4H
|
||||||
|
|
||||||
movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
|
movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
psrad xmm1,DESCALE_P1
|
psrad xmm1, DESCALE_P1
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
paddd xmm0,xmm2
|
paddd xmm0, xmm2
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
psrad xmm6,DESCALE_P1
|
psrad xmm6, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
||||||
packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
||||||
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
||||||
|
|
||||||
movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||||
movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
||||||
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
||||||
@@ -472,19 +472,19 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
||||||
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
||||||
punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
||||||
punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
||||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
||||||
@@ -492,12 +492,12 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
||||||
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
||||||
|
|
||||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
||||||
punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
||||||
movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
||||||
punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||||
@@ -529,53 +529,53 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||||
|
|
||||||
movdqa xmm6,xmm1 ; xmm1=in2=z2
|
movdqa xmm6, xmm1 ; xmm1=in2=z2
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklwd xmm6,xmm2 ; xmm2=in6=z3
|
punpcklwd xmm6, xmm2 ; xmm2=in6=z3
|
||||||
punpckhwd xmm5,xmm2
|
punpckhwd xmm5, xmm2
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L
|
pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
|
||||||
pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
|
pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
|
||||||
pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
|
pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
|
||||||
pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H
|
pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
|
||||||
|
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
paddw xmm7,xmm0 ; xmm7=in0+in4
|
paddw xmm7, xmm0 ; xmm7=in0+in4
|
||||||
psubw xmm3,xmm0 ; xmm3=in0-in4
|
psubw xmm3, xmm0 ; xmm3=in0-in4
|
||||||
|
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
pxor xmm0,xmm0
|
pxor xmm0, xmm0
|
||||||
punpcklwd xmm4,xmm7 ; xmm4=tmp0L
|
punpcklwd xmm4, xmm7 ; xmm4=tmp0L
|
||||||
punpckhwd xmm0,xmm7 ; xmm0=tmp0H
|
punpckhwd xmm0, xmm7 ; xmm0=tmp0H
|
||||||
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||||
psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm4,xmm6 ; xmm4=tmp10L
|
paddd xmm4, xmm6 ; xmm4=tmp10L
|
||||||
psubd xmm7,xmm6 ; xmm7=tmp13L
|
psubd xmm7, xmm6 ; xmm7=tmp13L
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
paddd xmm0,xmm5 ; xmm0=tmp10H
|
paddd xmm0, xmm5 ; xmm0=tmp10H
|
||||||
psubd xmm6,xmm5 ; xmm6=tmp13H
|
psubd xmm6, xmm5 ; xmm6=tmp13H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
||||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
||||||
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
||||||
|
|
||||||
pxor xmm5,xmm5
|
pxor xmm5, xmm5
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm5,xmm3 ; xmm5=tmp1L
|
punpcklwd xmm5, xmm3 ; xmm5=tmp1L
|
||||||
punpckhwd xmm4,xmm3 ; xmm4=tmp1H
|
punpckhwd xmm4, xmm3 ; xmm4=tmp1H
|
||||||
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||||
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
paddd xmm5,xmm1 ; xmm5=tmp11L
|
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||||
psubd xmm0,xmm1 ; xmm0=tmp12L
|
psubd xmm0, xmm1 ; xmm0=tmp12L
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm4,xmm2 ; xmm4=tmp11H
|
paddd xmm4, xmm2 ; xmm4=tmp11H
|
||||||
psubd xmm7,xmm2 ; xmm7=tmp12H
|
psubd xmm7, xmm2 ; xmm7=tmp12H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
||||||
@@ -589,10 +589,10 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
||||||
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
||||||
|
|
||||||
movdqa xmm5,xmm6
|
movdqa xmm5, xmm6
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
paddw xmm5,xmm1 ; xmm5=z3
|
paddw xmm5, xmm1 ; xmm5=z3
|
||||||
paddw xmm4,xmm2 ; xmm4=z4
|
paddw xmm4, xmm2 ; xmm4=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -603,16 +603,16 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
movdqa xmm7,xmm5
|
movdqa xmm7, xmm5
|
||||||
punpcklwd xmm0,xmm4
|
punpcklwd xmm0, xmm4
|
||||||
punpckhwd xmm7,xmm4
|
punpckhwd xmm7, xmm4
|
||||||
movdqa xmm5,xmm0
|
movdqa xmm5, xmm0
|
||||||
movdqa xmm4,xmm7
|
movdqa xmm4, xmm7
|
||||||
pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L
|
pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
|
||||||
pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H
|
pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
|
||||||
pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
|
pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
|
||||||
pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H
|
pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
||||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
||||||
@@ -633,38 +633,38 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp0 += z3; tmp1 += z4;
|
; tmp0 += z3; tmp1 += z4;
|
||||||
; tmp2 += z3; tmp3 += z4;
|
; tmp2 += z3; tmp3 += z4;
|
||||||
|
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
punpckhwd xmm7,xmm3
|
punpckhwd xmm7, xmm3
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L
|
pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
|
||||||
pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H
|
pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
|
||||||
pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L
|
pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
|
||||||
pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H
|
pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
|
||||||
|
|
||||||
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
||||||
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
||||||
paddd xmm1,xmm5 ; xmm1=tmp3L
|
paddd xmm1, xmm5 ; xmm1=tmp3L
|
||||||
paddd xmm3,xmm4 ; xmm3=tmp3H
|
paddd xmm3, xmm4 ; xmm3=tmp3H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
||||||
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
||||||
|
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
punpcklwd xmm0,xmm6
|
punpcklwd xmm0, xmm6
|
||||||
punpckhwd xmm7,xmm6
|
punpckhwd xmm7, xmm6
|
||||||
movdqa xmm2,xmm0
|
movdqa xmm2, xmm0
|
||||||
movdqa xmm6,xmm7
|
movdqa xmm6, xmm7
|
||||||
pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L
|
pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
|
||||||
pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H
|
pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
|
||||||
pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L
|
pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
|
||||||
pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
|
pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
|
||||||
|
|
||||||
paddd xmm0,xmm5 ; xmm0=tmp1L
|
paddd xmm0, xmm5 ; xmm0=tmp1L
|
||||||
paddd xmm7,xmm4 ; xmm7=tmp1H
|
paddd xmm7, xmm4 ; xmm7=tmp1H
|
||||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
||||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||||
|
|
||||||
@@ -676,53 +676,53 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||||
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm5,xmm1 ; xmm5=data0L
|
paddd xmm5, xmm1 ; xmm5=data0L
|
||||||
paddd xmm4,xmm3 ; xmm4=data0H
|
paddd xmm4, xmm3 ; xmm4=data0H
|
||||||
psubd xmm0,xmm1 ; xmm0=data7L
|
psubd xmm0, xmm1 ; xmm0=data7L
|
||||||
psubd xmm7,xmm3 ; xmm7=data7H
|
psubd xmm7, xmm3 ; xmm7=data7H
|
||||||
|
|
||||||
movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
|
movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
psrad xmm5,DESCALE_P2
|
psrad xmm5, DESCALE_P2
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
paddd xmm0,xmm1
|
paddd xmm0, xmm1
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
psrad xmm7,DESCALE_P2
|
psrad xmm7, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
||||||
packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
||||||
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
paddd xmm3,xmm2 ; xmm3=data1L
|
paddd xmm3, xmm2 ; xmm3=data1L
|
||||||
paddd xmm1,xmm6 ; xmm1=data1H
|
paddd xmm1, xmm6 ; xmm1=data1H
|
||||||
psubd xmm4,xmm2 ; xmm4=data6L
|
psubd xmm4, xmm2 ; xmm4=data6L
|
||||||
psubd xmm7,xmm6 ; xmm7=data6H
|
psubd xmm7, xmm6 ; xmm7=data6H
|
||||||
|
|
||||||
movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
|
movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm3,xmm2
|
paddd xmm3, xmm2
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
psrad xmm3,DESCALE_P2
|
psrad xmm3, DESCALE_P2
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
paddd xmm7,xmm2
|
paddd xmm7, xmm2
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm7,DESCALE_P2
|
psrad xmm7, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||||
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||||
|
|
||||||
packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
||||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
||||||
@@ -732,91 +732,91 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
paddd xmm6,xmm1 ; xmm6=data2L
|
paddd xmm6, xmm1 ; xmm6=data2L
|
||||||
paddd xmm2,xmm7 ; xmm2=data2H
|
paddd xmm2, xmm7 ; xmm2=data2H
|
||||||
psubd xmm4,xmm1 ; xmm4=data5L
|
psubd xmm4, xmm1 ; xmm4=data5L
|
||||||
psubd xmm0,xmm7 ; xmm0=data5H
|
psubd xmm0, xmm7 ; xmm0=data5H
|
||||||
|
|
||||||
movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
|
movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm6,xmm5
|
paddd xmm6, xmm5
|
||||||
paddd xmm2,xmm5
|
paddd xmm2, xmm5
|
||||||
psrad xmm6,DESCALE_P2
|
psrad xmm6, DESCALE_P2
|
||||||
psrad xmm2,DESCALE_P2
|
psrad xmm2, DESCALE_P2
|
||||||
paddd xmm4,xmm5
|
paddd xmm4, xmm5
|
||||||
paddd xmm0,xmm5
|
paddd xmm0, xmm5
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
||||||
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
||||||
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
||||||
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
||||||
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
||||||
|
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
paddd xmm3,xmm7 ; xmm3=data3L
|
paddd xmm3, xmm7 ; xmm3=data3L
|
||||||
paddd xmm1,xmm5 ; xmm1=data3H
|
paddd xmm1, xmm5 ; xmm1=data3H
|
||||||
psubd xmm2,xmm7 ; xmm2=data4L
|
psubd xmm2, xmm7 ; xmm2=data4L
|
||||||
psubd xmm0,xmm5 ; xmm0=data4H
|
psubd xmm0, xmm5 ; xmm0=data4H
|
||||||
|
|
||||||
movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
|
movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm3,xmm7
|
paddd xmm3, xmm7
|
||||||
paddd xmm1,xmm7
|
paddd xmm1, xmm7
|
||||||
psrad xmm3,DESCALE_P2
|
psrad xmm3, DESCALE_P2
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
paddd xmm2,xmm7
|
paddd xmm2, xmm7
|
||||||
paddd xmm0,xmm7
|
paddd xmm0, xmm7
|
||||||
psrad xmm2,DESCALE_P2
|
psrad xmm2, DESCALE_P2
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
|
|
||||||
movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
|
movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
|
||||||
|
|
||||||
packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
||||||
packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||||
packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||||
|
|
||||||
paddb xmm7,xmm5
|
paddb xmm7, xmm5
|
||||||
paddb xmm1,xmm5
|
paddb xmm1, xmm5
|
||||||
paddb xmm6,xmm5
|
paddb xmm6, xmm5
|
||||||
paddb xmm3,xmm5
|
paddb xmm3, xmm5
|
||||||
|
|
||||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||||
punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||||
punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||||
|
|
||||||
movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||||
punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||||
punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||||
punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||||
movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||||
punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||||
|
|
||||||
pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||||
pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||||
pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||||
pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||||
|
|
||||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||||
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||||
@@ -837,7 +837,7 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -47,18 +47,18 @@ F_3_072 equ 25172 ; FIX(3.072711026)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
|
||||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
|
||||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
|
||||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
|
||||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
|
||||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
|
||||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
|
||||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
|
||||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
|
||||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -108,11 +108,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_islow_sse2):
|
EXTN(jsimd_idct_islow_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
@@ -140,11 +140,11 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm1,xmm0
|
por xmm1, xmm0
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
packsswb xmm1,xmm1
|
packsswb xmm1, xmm1
|
||||||
movd eax,xmm1
|
movd eax, xmm1
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
@@ -152,27 +152,27 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||||
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
psllw xmm5,PASS1_BITS
|
psllw xmm5, PASS1_BITS
|
||||||
|
|
||||||
movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
||||||
punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
||||||
punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
||||||
|
|
||||||
pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
||||||
pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
||||||
pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
||||||
pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
||||||
pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
||||||
pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
||||||
pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
||||||
pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
||||||
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
||||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||||
jmp near .column_end
|
jmp near .column_end
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
%endif
|
%endif
|
||||||
.columnDCT:
|
.columnDCT:
|
||||||
|
|
||||||
@@ -196,53 +196,53 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||||
|
|
||||||
movdqa xmm4,xmm1 ; xmm1=in2=z2
|
movdqa xmm4, xmm1 ; xmm1=in2=z2
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklwd xmm4,xmm3 ; xmm3=in6=z3
|
punpcklwd xmm4, xmm3 ; xmm3=in6=z3
|
||||||
punpckhwd xmm5,xmm3
|
punpckhwd xmm5, xmm3
|
||||||
movdqa xmm1,xmm4
|
movdqa xmm1, xmm4
|
||||||
movdqa xmm3,xmm5
|
movdqa xmm3, xmm5
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
|
pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
|
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
|
pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
|
||||||
|
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
paddw xmm0,xmm2 ; xmm0=in0+in4
|
paddw xmm0, xmm2 ; xmm0=in0+in4
|
||||||
psubw xmm6,xmm2 ; xmm6=in0-in4
|
psubw xmm6, xmm2 ; xmm6=in0-in4
|
||||||
|
|
||||||
pxor xmm7,xmm7
|
pxor xmm7, xmm7
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
punpcklwd xmm7,xmm0 ; xmm7=tmp0L
|
punpcklwd xmm7, xmm0 ; xmm7=tmp0L
|
||||||
punpckhwd xmm2,xmm0 ; xmm2=tmp0H
|
punpckhwd xmm2, xmm0 ; xmm2=tmp0H
|
||||||
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||||
psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
paddd xmm7,xmm4 ; xmm7=tmp10L
|
paddd xmm7, xmm4 ; xmm7=tmp10L
|
||||||
psubd xmm0,xmm4 ; xmm0=tmp13L
|
psubd xmm0, xmm4 ; xmm0=tmp13L
|
||||||
movdqa xmm4,xmm2
|
movdqa xmm4, xmm2
|
||||||
paddd xmm2,xmm5 ; xmm2=tmp10H
|
paddd xmm2, xmm5 ; xmm2=tmp10H
|
||||||
psubd xmm4,xmm5 ; xmm4=tmp13H
|
psubd xmm4, xmm5 ; xmm4=tmp13H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
||||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
||||||
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
||||||
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
||||||
|
|
||||||
pxor xmm5,xmm5
|
pxor xmm5, xmm5
|
||||||
pxor xmm7,xmm7
|
pxor xmm7, xmm7
|
||||||
punpcklwd xmm5,xmm6 ; xmm5=tmp1L
|
punpcklwd xmm5, xmm6 ; xmm5=tmp1L
|
||||||
punpckhwd xmm7,xmm6 ; xmm7=tmp1H
|
punpckhwd xmm7, xmm6 ; xmm7=tmp1H
|
||||||
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||||
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
paddd xmm5,xmm1 ; xmm5=tmp11L
|
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||||
psubd xmm2,xmm1 ; xmm2=tmp12L
|
psubd xmm2, xmm1 ; xmm2=tmp12L
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
paddd xmm7,xmm3 ; xmm7=tmp11H
|
paddd xmm7, xmm3 ; xmm7=tmp11H
|
||||||
psubd xmm0,xmm3 ; xmm0=tmp12H
|
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||||
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
||||||
@@ -260,10 +260,10 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm5,xmm6
|
movdqa xmm5, xmm6
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddw xmm5,xmm3 ; xmm5=z3
|
paddw xmm5, xmm3 ; xmm5=z3
|
||||||
paddw xmm7,xmm1 ; xmm7=z4
|
paddw xmm7, xmm1 ; xmm7=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -274,16 +274,16 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
punpcklwd xmm2,xmm7
|
punpcklwd xmm2, xmm7
|
||||||
punpckhwd xmm0,xmm7
|
punpckhwd xmm0, xmm7
|
||||||
movdqa xmm5,xmm2
|
movdqa xmm5, xmm2
|
||||||
movdqa xmm7,xmm0
|
movdqa xmm7, xmm0
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
|
pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
|
pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
||||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
||||||
@@ -304,38 +304,38 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp0 += z3; tmp1 += z4;
|
; tmp0 += z3; tmp1 += z4;
|
||||||
; tmp2 += z3; tmp3 += z4;
|
; tmp2 += z3; tmp3 += z4;
|
||||||
|
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
movdqa xmm0,xmm3
|
movdqa xmm0, xmm3
|
||||||
punpcklwd xmm2,xmm4
|
punpcklwd xmm2, xmm4
|
||||||
punpckhwd xmm0,xmm4
|
punpckhwd xmm0, xmm4
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
|
pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
|
pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
|
pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
|
||||||
|
|
||||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
||||||
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
||||||
paddd xmm3,xmm5 ; xmm3=tmp3L
|
paddd xmm3, xmm5 ; xmm3=tmp3L
|
||||||
paddd xmm4,xmm7 ; xmm4=tmp3H
|
paddd xmm4, xmm7 ; xmm4=tmp3H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
||||||
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
||||||
|
|
||||||
movdqa xmm2,xmm1
|
movdqa xmm2, xmm1
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
punpcklwd xmm2,xmm6
|
punpcklwd xmm2, xmm6
|
||||||
punpckhwd xmm0,xmm6
|
punpckhwd xmm0, xmm6
|
||||||
movdqa xmm1,xmm2
|
movdqa xmm1, xmm2
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
|
pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
|
pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
|
||||||
|
|
||||||
paddd xmm2,xmm5 ; xmm2=tmp1L
|
paddd xmm2, xmm5 ; xmm2=tmp1L
|
||||||
paddd xmm0,xmm7 ; xmm0=tmp1H
|
paddd xmm0, xmm7 ; xmm0=tmp1H
|
||||||
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
||||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||||
|
|
||||||
@@ -347,57 +347,57 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
||||||
|
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm0,xmm7
|
movdqa xmm0, xmm7
|
||||||
paddd xmm5,xmm3 ; xmm5=data0L
|
paddd xmm5, xmm3 ; xmm5=data0L
|
||||||
paddd xmm7,xmm4 ; xmm7=data0H
|
paddd xmm7, xmm4 ; xmm7=data0H
|
||||||
psubd xmm2,xmm3 ; xmm2=data7L
|
psubd xmm2, xmm3 ; xmm2=data7L
|
||||||
psubd xmm0,xmm4 ; xmm0=data7H
|
psubd xmm0, xmm4 ; xmm0=data7H
|
||||||
|
|
||||||
movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
|
movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm5,xmm3
|
paddd xmm5, xmm3
|
||||||
paddd xmm7,xmm3
|
paddd xmm7, xmm3
|
||||||
psrad xmm5,DESCALE_P1
|
psrad xmm5, DESCALE_P1
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
paddd xmm2,xmm3
|
paddd xmm2, xmm3
|
||||||
paddd xmm0,xmm3
|
paddd xmm0, xmm3
|
||||||
psrad xmm2,DESCALE_P1
|
psrad xmm2, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
||||||
packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
||||||
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
||||||
|
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
movdqa xmm0,xmm3
|
movdqa xmm0, xmm3
|
||||||
paddd xmm4,xmm1 ; xmm4=data1L
|
paddd xmm4, xmm1 ; xmm4=data1L
|
||||||
paddd xmm3,xmm6 ; xmm3=data1H
|
paddd xmm3, xmm6 ; xmm3=data1H
|
||||||
psubd xmm7,xmm1 ; xmm7=data6L
|
psubd xmm7, xmm1 ; xmm7=data6L
|
||||||
psubd xmm0,xmm6 ; xmm0=data6H
|
psubd xmm0, xmm6 ; xmm0=data6H
|
||||||
|
|
||||||
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
|
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
paddd xmm3,xmm1
|
paddd xmm3, xmm1
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
psrad xmm3,DESCALE_P1
|
psrad xmm3, DESCALE_P1
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
paddd xmm0,xmm1
|
paddd xmm0, xmm1
|
||||||
psrad xmm7,DESCALE_P1
|
psrad xmm7, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||||
packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
||||||
|
|
||||||
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
||||||
punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
||||||
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
||||||
@@ -409,69 +409,69 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
||||||
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
||||||
|
|
||||||
movdqa xmm5,xmm3
|
movdqa xmm5, xmm3
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
paddd xmm3,xmm4 ; xmm3=data2L
|
paddd xmm3, xmm4 ; xmm3=data2L
|
||||||
paddd xmm0,xmm2 ; xmm0=data2H
|
paddd xmm0, xmm2 ; xmm0=data2H
|
||||||
psubd xmm5,xmm4 ; xmm5=data5L
|
psubd xmm5, xmm4 ; xmm5=data5L
|
||||||
psubd xmm6,xmm2 ; xmm6=data5H
|
psubd xmm6, xmm2 ; xmm6=data5H
|
||||||
|
|
||||||
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
|
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm3,xmm7
|
paddd xmm3, xmm7
|
||||||
paddd xmm0,xmm7
|
paddd xmm0, xmm7
|
||||||
psrad xmm3,DESCALE_P1
|
psrad xmm3, DESCALE_P1
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
paddd xmm5,xmm7
|
paddd xmm5, xmm7
|
||||||
paddd xmm6,xmm7
|
paddd xmm6, xmm7
|
||||||
psrad xmm5,DESCALE_P1
|
psrad xmm5, DESCALE_P1
|
||||||
psrad xmm6,DESCALE_P1
|
psrad xmm6, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
||||||
packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||||
|
|
||||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
||||||
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
||||||
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
||||||
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
||||||
|
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
movdqa xmm6,xmm4
|
movdqa xmm6, xmm4
|
||||||
paddd xmm1,xmm2 ; xmm1=data3L
|
paddd xmm1, xmm2 ; xmm1=data3L
|
||||||
paddd xmm4,xmm7 ; xmm4=data3H
|
paddd xmm4, xmm7 ; xmm4=data3H
|
||||||
psubd xmm0,xmm2 ; xmm0=data4L
|
psubd xmm0, xmm2 ; xmm0=data4L
|
||||||
psubd xmm6,xmm7 ; xmm6=data4H
|
psubd xmm6, xmm7 ; xmm6=data4H
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
|
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
|
||||||
|
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
psrad xmm1,DESCALE_P1
|
psrad xmm1, DESCALE_P1
|
||||||
psrad xmm4,DESCALE_P1
|
psrad xmm4, DESCALE_P1
|
||||||
paddd xmm0,xmm2
|
paddd xmm0, xmm2
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
psrad xmm0,DESCALE_P1
|
psrad xmm0, DESCALE_P1
|
||||||
psrad xmm6,DESCALE_P1
|
psrad xmm6, DESCALE_P1
|
||||||
|
|
||||||
packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
||||||
packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
||||||
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
||||||
|
|
||||||
movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
|
movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||||
movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
||||||
punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
||||||
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
||||||
@@ -479,19 +479,19 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
||||||
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
||||||
|
|
||||||
movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
|
movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
||||||
punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
||||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
||||||
punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
||||||
punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
||||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
||||||
punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
||||||
|
|
||||||
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
||||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
||||||
@@ -499,12 +499,12 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
||||||
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
||||||
|
|
||||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
||||||
punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
||||||
movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
|
movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
|
||||||
punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
||||||
punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||||
@@ -536,53 +536,53 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||||
|
|
||||||
movdqa xmm6,xmm1 ; xmm1=in2=z2
|
movdqa xmm6, xmm1 ; xmm1=in2=z2
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
punpcklwd xmm6,xmm2 ; xmm2=in6=z3
|
punpcklwd xmm6, xmm2 ; xmm2=in6=z3
|
||||||
punpckhwd xmm5,xmm2
|
punpckhwd xmm5, xmm2
|
||||||
movdqa xmm1,xmm6
|
movdqa xmm1, xmm6
|
||||||
movdqa xmm2,xmm5
|
movdqa xmm2, xmm5
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
|
pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
|
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
|
pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
|
||||||
|
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
paddw xmm7,xmm0 ; xmm7=in0+in4
|
paddw xmm7, xmm0 ; xmm7=in0+in4
|
||||||
psubw xmm3,xmm0 ; xmm3=in0-in4
|
psubw xmm3, xmm0 ; xmm3=in0-in4
|
||||||
|
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
pxor xmm0,xmm0
|
pxor xmm0, xmm0
|
||||||
punpcklwd xmm4,xmm7 ; xmm4=tmp0L
|
punpcklwd xmm4, xmm7 ; xmm4=tmp0L
|
||||||
punpckhwd xmm0,xmm7 ; xmm0=tmp0H
|
punpckhwd xmm0, xmm7 ; xmm0=tmp0H
|
||||||
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||||
psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm4,xmm6 ; xmm4=tmp10L
|
paddd xmm4, xmm6 ; xmm4=tmp10L
|
||||||
psubd xmm7,xmm6 ; xmm7=tmp13L
|
psubd xmm7, xmm6 ; xmm7=tmp13L
|
||||||
movdqa xmm6,xmm0
|
movdqa xmm6, xmm0
|
||||||
paddd xmm0,xmm5 ; xmm0=tmp10H
|
paddd xmm0, xmm5 ; xmm0=tmp10H
|
||||||
psubd xmm6,xmm5 ; xmm6=tmp13H
|
psubd xmm6, xmm5 ; xmm6=tmp13H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
||||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
||||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
||||||
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
||||||
|
|
||||||
pxor xmm5,xmm5
|
pxor xmm5, xmm5
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm5,xmm3 ; xmm5=tmp1L
|
punpcklwd xmm5, xmm3 ; xmm5=tmp1L
|
||||||
punpckhwd xmm4,xmm3 ; xmm4=tmp1H
|
punpckhwd xmm4, xmm3 ; xmm4=tmp1H
|
||||||
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||||
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
paddd xmm5,xmm1 ; xmm5=tmp11L
|
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||||
psubd xmm0,xmm1 ; xmm0=tmp12L
|
psubd xmm0, xmm1 ; xmm0=tmp12L
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm4,xmm2 ; xmm4=tmp11H
|
paddd xmm4, xmm2 ; xmm4=tmp11H
|
||||||
psubd xmm7,xmm2 ; xmm7=tmp12H
|
psubd xmm7, xmm2 ; xmm7=tmp12H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
||||||
@@ -596,10 +596,10 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
||||||
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
||||||
|
|
||||||
movdqa xmm5,xmm6
|
movdqa xmm5, xmm6
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
paddw xmm5,xmm1 ; xmm5=z3
|
paddw xmm5, xmm1 ; xmm5=z3
|
||||||
paddw xmm4,xmm2 ; xmm4=z4
|
paddw xmm4, xmm2 ; xmm4=z4
|
||||||
|
|
||||||
; (Original)
|
; (Original)
|
||||||
; z5 = (z3 + z4) * 1.175875602;
|
; z5 = (z3 + z4) * 1.175875602;
|
||||||
@@ -610,16 +610,16 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
movdqa xmm7,xmm5
|
movdqa xmm7, xmm5
|
||||||
punpcklwd xmm0,xmm4
|
punpcklwd xmm0, xmm4
|
||||||
punpckhwd xmm7,xmm4
|
punpckhwd xmm7, xmm4
|
||||||
movdqa xmm5,xmm0
|
movdqa xmm5, xmm0
|
||||||
movdqa xmm4,xmm7
|
movdqa xmm4, xmm7
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
|
pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
||||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
||||||
@@ -640,38 +640,38 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; tmp0 += z3; tmp1 += z4;
|
; tmp0 += z3; tmp1 += z4;
|
||||||
; tmp2 += z3; tmp3 += z4;
|
; tmp2 += z3; tmp3 += z4;
|
||||||
|
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
punpckhwd xmm7,xmm3
|
punpckhwd xmm7, xmm3
|
||||||
movdqa xmm1,xmm0
|
movdqa xmm1, xmm0
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
|
pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
|
pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
|
||||||
|
|
||||||
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
||||||
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
||||||
paddd xmm1,xmm5 ; xmm1=tmp3L
|
paddd xmm1, xmm5 ; xmm1=tmp3L
|
||||||
paddd xmm3,xmm4 ; xmm3=tmp3H
|
paddd xmm3, xmm4 ; xmm3=tmp3H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
||||||
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
||||||
|
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
punpcklwd xmm0,xmm6
|
punpcklwd xmm0, xmm6
|
||||||
punpckhwd xmm7,xmm6
|
punpckhwd xmm7, xmm6
|
||||||
movdqa xmm2,xmm0
|
movdqa xmm2, xmm0
|
||||||
movdqa xmm6,xmm7
|
movdqa xmm6, xmm7
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
|
pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
|
pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
|
pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
|
||||||
|
|
||||||
paddd xmm0,xmm5 ; xmm0=tmp1L
|
paddd xmm0, xmm5 ; xmm0=tmp1L
|
||||||
paddd xmm7,xmm4 ; xmm7=tmp1H
|
paddd xmm7, xmm4 ; xmm7=tmp1H
|
||||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
||||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||||
|
|
||||||
@@ -683,53 +683,53 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||||
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
||||||
|
|
||||||
movdqa xmm0,xmm5
|
movdqa xmm0, xmm5
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm5,xmm1 ; xmm5=data0L
|
paddd xmm5, xmm1 ; xmm5=data0L
|
||||||
paddd xmm4,xmm3 ; xmm4=data0H
|
paddd xmm4, xmm3 ; xmm4=data0H
|
||||||
psubd xmm0,xmm1 ; xmm0=data7L
|
psubd xmm0, xmm1 ; xmm0=data7L
|
||||||
psubd xmm7,xmm3 ; xmm7=data7H
|
psubd xmm7, xmm3 ; xmm7=data7H
|
||||||
|
|
||||||
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
|
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
psrad xmm5,DESCALE_P2
|
psrad xmm5, DESCALE_P2
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
paddd xmm0,xmm1
|
paddd xmm0, xmm1
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
psrad xmm7,DESCALE_P2
|
psrad xmm7, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
||||||
packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
||||||
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
||||||
|
|
||||||
movdqa xmm4,xmm3
|
movdqa xmm4, xmm3
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
paddd xmm3,xmm2 ; xmm3=data1L
|
paddd xmm3, xmm2 ; xmm3=data1L
|
||||||
paddd xmm1,xmm6 ; xmm1=data1H
|
paddd xmm1, xmm6 ; xmm1=data1H
|
||||||
psubd xmm4,xmm2 ; xmm4=data6L
|
psubd xmm4, xmm2 ; xmm4=data6L
|
||||||
psubd xmm7,xmm6 ; xmm7=data6H
|
psubd xmm7, xmm6 ; xmm7=data6H
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
|
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm3,xmm2
|
paddd xmm3, xmm2
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
psrad xmm3,DESCALE_P2
|
psrad xmm3, DESCALE_P2
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
paddd xmm4,xmm2
|
paddd xmm4, xmm2
|
||||||
paddd xmm7,xmm2
|
paddd xmm7, xmm2
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm7,DESCALE_P2
|
psrad xmm7, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||||
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||||
|
|
||||||
packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
||||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
||||||
@@ -739,91 +739,91 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
paddd xmm6,xmm1 ; xmm6=data2L
|
paddd xmm6, xmm1 ; xmm6=data2L
|
||||||
paddd xmm2,xmm7 ; xmm2=data2H
|
paddd xmm2, xmm7 ; xmm2=data2H
|
||||||
psubd xmm4,xmm1 ; xmm4=data5L
|
psubd xmm4, xmm1 ; xmm4=data5L
|
||||||
psubd xmm0,xmm7 ; xmm0=data5H
|
psubd xmm0, xmm7 ; xmm0=data5H
|
||||||
|
|
||||||
movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
|
movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm6,xmm5
|
paddd xmm6, xmm5
|
||||||
paddd xmm2,xmm5
|
paddd xmm2, xmm5
|
||||||
psrad xmm6,DESCALE_P2
|
psrad xmm6, DESCALE_P2
|
||||||
psrad xmm2,DESCALE_P2
|
psrad xmm2, DESCALE_P2
|
||||||
paddd xmm4,xmm5
|
paddd xmm4, xmm5
|
||||||
paddd xmm0,xmm5
|
paddd xmm0, xmm5
|
||||||
psrad xmm4,DESCALE_P2
|
psrad xmm4, DESCALE_P2
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
|
|
||||||
packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
||||||
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||||
|
|
||||||
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
||||||
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
||||||
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
||||||
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
||||||
|
|
||||||
movdqa xmm2,xmm3
|
movdqa xmm2, xmm3
|
||||||
movdqa xmm0,xmm1
|
movdqa xmm0, xmm1
|
||||||
paddd xmm3,xmm7 ; xmm3=data3L
|
paddd xmm3, xmm7 ; xmm3=data3L
|
||||||
paddd xmm1,xmm5 ; xmm1=data3H
|
paddd xmm1, xmm5 ; xmm1=data3H
|
||||||
psubd xmm2,xmm7 ; xmm2=data4L
|
psubd xmm2, xmm7 ; xmm2=data4L
|
||||||
psubd xmm0,xmm5 ; xmm0=data4H
|
psubd xmm0, xmm5 ; xmm0=data4H
|
||||||
|
|
||||||
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
|
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
|
||||||
|
|
||||||
paddd xmm3,xmm7
|
paddd xmm3, xmm7
|
||||||
paddd xmm1,xmm7
|
paddd xmm1, xmm7
|
||||||
psrad xmm3,DESCALE_P2
|
psrad xmm3, DESCALE_P2
|
||||||
psrad xmm1,DESCALE_P2
|
psrad xmm1, DESCALE_P2
|
||||||
paddd xmm2,xmm7
|
paddd xmm2, xmm7
|
||||||
paddd xmm0,xmm7
|
paddd xmm0, xmm7
|
||||||
psrad xmm2,DESCALE_P2
|
psrad xmm2, DESCALE_P2
|
||||||
psrad xmm0,DESCALE_P2
|
psrad xmm0, DESCALE_P2
|
||||||
|
|
||||||
movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
|
movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
|
||||||
|
|
||||||
packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
||||||
packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||||
|
|
||||||
packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||||
packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||||
|
|
||||||
paddb xmm7,xmm5
|
paddb xmm7, xmm5
|
||||||
paddb xmm1,xmm5
|
paddb xmm1, xmm5
|
||||||
paddb xmm6,xmm5
|
paddb xmm6, xmm5
|
||||||
paddb xmm3,xmm5
|
paddb xmm3, xmm5
|
||||||
|
|
||||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||||
punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||||
punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||||
punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||||
|
|
||||||
movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
|
movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||||
punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
|
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||||
punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||||
punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||||
|
|
||||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||||
punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||||
movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
|
movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
|
||||||
punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||||
punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||||
|
|
||||||
pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||||
pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||||
pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||||
pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||||
|
|
||||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||||
@@ -848,7 +848,7 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; unused
|
; pop ecx ; unused
|
||||||
poppic ebx
|
poppic ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
|
|||||||
@@ -52,20 +52,20 @@ F_3_624 equ 29692 ; FIX(3.624509785)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
|
F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
|
||||||
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
|
F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
|
||||||
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
|
F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
|
||||||
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
|
F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
|
||||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
|
||||||
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
|
F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
|
||||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
|
||||||
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
|
F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
|
||||||
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
|
F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
|
||||||
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
|
F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
|
F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
|
||||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
|
||||||
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
|
F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -117,11 +117,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_4x4_sse2):
|
EXTN(jsimd_idct_4x4_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp ; rax = original rbp
|
mov rax, rsp ; rax = original rbp
|
||||||
sub rsp, byte 4
|
sub rsp, byte 4
|
||||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [rsp],rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
@@ -141,11 +141,11 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||||
por xmm0,xmm1
|
por xmm0, xmm1
|
||||||
packsswb xmm0,xmm0
|
packsswb xmm0, xmm0
|
||||||
packsswb xmm0,xmm0
|
packsswb xmm0, xmm0
|
||||||
movd eax,xmm0
|
movd eax, xmm0
|
||||||
test rax,rax
|
test rax, rax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
@@ -153,16 +153,16 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
psllw xmm0,PASS1_BITS
|
psllw xmm0, PASS1_BITS
|
||||||
|
|
||||||
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
|
punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
|
||||||
|
|
||||||
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
|
pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
|
||||||
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
|
pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
|
||||||
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
|
pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
|
||||||
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
|
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
|
||||||
|
|
||||||
jmp near .column_end
|
jmp near .column_end
|
||||||
%endif
|
%endif
|
||||||
@@ -179,32 +179,32 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm5,xmm0
|
movdqa xmm5, xmm0
|
||||||
punpcklwd xmm4,xmm1
|
punpcklwd xmm4, xmm1
|
||||||
punpckhwd xmm5,xmm1
|
punpckhwd xmm5, xmm1
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L)
|
pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
|
||||||
pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H)
|
pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
|
||||||
pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L)
|
pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
|
||||||
pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H)
|
pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
|
||||||
|
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
punpcklwd xmm6,xmm3
|
punpcklwd xmm6, xmm3
|
||||||
punpckhwd xmm7,xmm3
|
punpckhwd xmm7, xmm3
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L)
|
pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
|
||||||
pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H)
|
pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
|
||||||
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L)
|
pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
|
||||||
pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H)
|
pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
|
||||||
|
|
||||||
paddd xmm6,xmm4 ; xmm6=tmp2L
|
paddd xmm6, xmm4 ; xmm6=tmp2L
|
||||||
paddd xmm7,xmm5 ; xmm7=tmp2H
|
paddd xmm7, xmm5 ; xmm7=tmp2H
|
||||||
paddd xmm2,xmm0 ; xmm2=tmp0L
|
paddd xmm2, xmm0 ; xmm2=tmp0L
|
||||||
paddd xmm3,xmm1 ; xmm3=tmp0H
|
paddd xmm3, xmm1 ; xmm3=tmp0H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
|
||||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
|
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
|
||||||
@@ -218,86 +218,86 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
pxor xmm1,xmm1
|
pxor xmm1, xmm1
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
punpcklwd xmm1,xmm4 ; xmm1=tmp0L
|
punpcklwd xmm1, xmm4 ; xmm1=tmp0L
|
||||||
punpckhwd xmm2,xmm4 ; xmm2=tmp0H
|
punpckhwd xmm2, xmm4 ; xmm2=tmp0H
|
||||||
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
|
psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
|
||||||
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
|
psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
|
||||||
|
|
||||||
movdqa xmm3,xmm5 ; xmm5=in2=z2
|
movdqa xmm3, xmm5 ; xmm5=in2=z2
|
||||||
punpcklwd xmm5,xmm0 ; xmm0=in6=z3
|
punpcklwd xmm5, xmm0 ; xmm0=in6=z3
|
||||||
punpckhwd xmm3,xmm0
|
punpckhwd xmm3, xmm0
|
||||||
pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L
|
pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
|
||||||
pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H
|
pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
|
||||||
|
|
||||||
movdqa xmm4,xmm1
|
movdqa xmm4, xmm1
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
paddd xmm1,xmm5 ; xmm1=tmp10L
|
paddd xmm1, xmm5 ; xmm1=tmp10L
|
||||||
paddd xmm2,xmm3 ; xmm2=tmp10H
|
paddd xmm2, xmm3 ; xmm2=tmp10H
|
||||||
psubd xmm4,xmm5 ; xmm4=tmp12L
|
psubd xmm4, xmm5 ; xmm4=tmp12L
|
||||||
psubd xmm0,xmm3 ; xmm0=tmp12H
|
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
paddd xmm1,xmm6 ; xmm1=data0L
|
paddd xmm1, xmm6 ; xmm1=data0L
|
||||||
paddd xmm2,xmm7 ; xmm2=data0H
|
paddd xmm2, xmm7 ; xmm2=data0H
|
||||||
psubd xmm5,xmm6 ; xmm5=data3L
|
psubd xmm5, xmm6 ; xmm5=data3L
|
||||||
psubd xmm3,xmm7 ; xmm3=data3H
|
psubd xmm3, xmm7 ; xmm3=data3H
|
||||||
|
|
||||||
movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
|
movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
|
||||||
|
|
||||||
paddd xmm1,xmm6
|
paddd xmm1, xmm6
|
||||||
paddd xmm2,xmm6
|
paddd xmm2, xmm6
|
||||||
psrad xmm1,DESCALE_P1_4
|
psrad xmm1, DESCALE_P1_4
|
||||||
psrad xmm2,DESCALE_P1_4
|
psrad xmm2, DESCALE_P1_4
|
||||||
paddd xmm5,xmm6
|
paddd xmm5, xmm6
|
||||||
paddd xmm3,xmm6
|
paddd xmm3, xmm6
|
||||||
psrad xmm5,DESCALE_P1_4
|
psrad xmm5, DESCALE_P1_4
|
||||||
psrad xmm3,DESCALE_P1_4
|
psrad xmm3, DESCALE_P1_4
|
||||||
|
|
||||||
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
|
packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
|
||||||
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
|
packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
|
||||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
|
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
|
||||||
|
|
||||||
movdqa xmm2,xmm4
|
movdqa xmm2, xmm4
|
||||||
movdqa xmm3,xmm0
|
movdqa xmm3, xmm0
|
||||||
paddd xmm4,xmm7 ; xmm4=data1L
|
paddd xmm4, xmm7 ; xmm4=data1L
|
||||||
paddd xmm0,xmm6 ; xmm0=data1H
|
paddd xmm0, xmm6 ; xmm0=data1H
|
||||||
psubd xmm2,xmm7 ; xmm2=data2L
|
psubd xmm2, xmm7 ; xmm2=data2L
|
||||||
psubd xmm3,xmm6 ; xmm3=data2H
|
psubd xmm3, xmm6 ; xmm3=data2H
|
||||||
|
|
||||||
movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
|
movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
|
||||||
|
|
||||||
paddd xmm4,xmm7
|
paddd xmm4, xmm7
|
||||||
paddd xmm0,xmm7
|
paddd xmm0, xmm7
|
||||||
psrad xmm4,DESCALE_P1_4
|
psrad xmm4, DESCALE_P1_4
|
||||||
psrad xmm0,DESCALE_P1_4
|
psrad xmm0, DESCALE_P1_4
|
||||||
paddd xmm2,xmm7
|
paddd xmm2, xmm7
|
||||||
paddd xmm3,xmm7
|
paddd xmm3, xmm7
|
||||||
psrad xmm2,DESCALE_P1_4
|
psrad xmm2, DESCALE_P1_4
|
||||||
psrad xmm3,DESCALE_P1_4
|
psrad xmm3, DESCALE_P1_4
|
||||||
|
|
||||||
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||||
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
|
packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
|
||||||
|
|
||||||
movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
|
punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
|
punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
|
||||||
.column_end:
|
.column_end:
|
||||||
|
|
||||||
; -- Prefetch the next coefficient block
|
; -- Prefetch the next coefficient block
|
||||||
@@ -315,70 +315,70 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm4,xmm1 ; xmm4=tmp0
|
punpcklwd xmm4, xmm1 ; xmm4=tmp0
|
||||||
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
|
psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
|
||||||
|
|
||||||
; -- Odd part
|
; -- Odd part
|
||||||
|
|
||||||
punpckhwd xmm1,xmm0
|
punpckhwd xmm1, xmm0
|
||||||
punpckhwd xmm6,xmm3
|
punpckhwd xmm6, xmm3
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2)
|
pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
|
||||||
pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2)
|
pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
|
||||||
pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0)
|
pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
|
||||||
pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0)
|
pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
|
||||||
|
|
||||||
paddd xmm6,xmm1 ; xmm6=tmp2
|
paddd xmm6, xmm1 ; xmm6=tmp2
|
||||||
paddd xmm2,xmm5 ; xmm2=tmp0
|
paddd xmm2, xmm5 ; xmm2=tmp0
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2
|
pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
|
||||||
|
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm4,xmm0 ; xmm4=tmp10
|
paddd xmm4, xmm0 ; xmm4=tmp10
|
||||||
psubd xmm7,xmm0 ; xmm7=tmp12
|
psubd xmm7, xmm0 ; xmm7=tmp12
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
|
movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
|
||||||
|
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
|
paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
|
||||||
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
|
paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
|
||||||
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
|
psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
|
||||||
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
|
psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
|
||||||
|
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
psrad xmm4,DESCALE_P2_4
|
psrad xmm4, DESCALE_P2_4
|
||||||
psrad xmm7,DESCALE_P2_4
|
psrad xmm7, DESCALE_P2_4
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
paddd xmm3,xmm1
|
paddd xmm3, xmm1
|
||||||
psrad xmm5,DESCALE_P2_4
|
psrad xmm5, DESCALE_P2_4
|
||||||
psrad xmm3,DESCALE_P2_4
|
psrad xmm3, DESCALE_P2_4
|
||||||
|
|
||||||
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
|
packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
|
||||||
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
|
packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
|
||||||
|
|
||||||
movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
|
punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
|
||||||
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
|
punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||||
|
|
||||||
movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
|
punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
|
||||||
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
|
punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
|
||||||
|
|
||||||
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
|
packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
|
||||||
paddb xmm4,[rel PB_CENTERJSAMP]
|
paddb xmm4, [rel PB_CENTERJSAMP]
|
||||||
|
|
||||||
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
|
pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
|
||||||
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
|
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
|
||||||
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
|
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
|
||||||
|
|
||||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||||
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||||
@@ -390,7 +390,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args
|
||||||
mov rsp,rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
@@ -416,8 +416,8 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_idct_2x2_sse2):
|
EXTN(jsimd_idct_2x2_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
@@ -450,27 +450,27 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
|
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
|
||||||
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
|
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
|
||||||
|
|
||||||
pcmpeqd xmm7,xmm7
|
pcmpeqd xmm7, xmm7
|
||||||
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
|
pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
|
||||||
|
|
||||||
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
|
movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
|
||||||
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
|
movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
|
||||||
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
|
punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
|
||||||
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
|
punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
|
||||||
pmaddwd xmm4,[rel PW_F362_MF127]
|
pmaddwd xmm4, [rel PW_F362_MF127]
|
||||||
pmaddwd xmm5,[rel PW_F085_MF072]
|
pmaddwd xmm5, [rel PW_F085_MF072]
|
||||||
|
|
||||||
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
|
psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
|
||||||
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
|
pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
|
||||||
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
|
psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
|
||||||
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
|
pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
|
||||||
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
|
por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
|
||||||
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
|
por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
|
||||||
pmaddwd xmm0,[rel PW_F362_MF127]
|
pmaddwd xmm0, [rel PW_F362_MF127]
|
||||||
pmaddwd xmm2,[rel PW_F085_MF072]
|
pmaddwd xmm2, [rel PW_F085_MF072]
|
||||||
|
|
||||||
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
|
paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
|
||||||
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
|
paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
@@ -479,36 +479,36 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
|
|
||||||
; xmm6=(00 01 ** 03 ** 05 ** 07)
|
; xmm6=(00 01 ** 03 ** 05 ** 07)
|
||||||
|
|
||||||
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
|
movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
|
||||||
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
|
pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
|
||||||
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
|
pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
|
||||||
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
|
psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
|
||||||
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
|
psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm3,xmm6
|
movdqa xmm3, xmm6
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
|
paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
|
||||||
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
|
paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
|
||||||
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
|
psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
|
||||||
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
|
psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
|
||||||
|
|
||||||
movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
|
movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
|
||||||
|
|
||||||
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
|
punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
|
||||||
|
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
|
punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
|
||||||
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
|
punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
|
||||||
|
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
psrad xmm6,DESCALE_P1_2
|
psrad xmm6, DESCALE_P1_2
|
||||||
|
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm7,xmm2
|
paddd xmm7, xmm2
|
||||||
psrad xmm1,DESCALE_P1_2
|
psrad xmm1, DESCALE_P1_2
|
||||||
psrad xmm7,DESCALE_P1_2
|
psrad xmm7, DESCALE_P1_2
|
||||||
|
|
||||||
; -- Prefetch the next coefficient block
|
; -- Prefetch the next coefficient block
|
||||||
|
|
||||||
@@ -531,34 +531,34 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
|
|
||||||
; -- Odd part
|
; -- Odd part
|
||||||
|
|
||||||
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
|
packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
|
||||||
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
|
packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
|
||||||
pmaddwd xmm1,[rel PW_F362_MF127]
|
pmaddwd xmm1, [rel PW_F362_MF127]
|
||||||
pmaddwd xmm7,[rel PW_F085_MF072]
|
pmaddwd xmm7, [rel PW_F085_MF072]
|
||||||
|
|
||||||
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
|
paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
|
pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
|
paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
|
||||||
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
|
psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
|
||||||
|
|
||||||
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
|
punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
|
||||||
|
|
||||||
paddd xmm6,[rel PD_DESCALE_P2_2]
|
paddd xmm6, [rel PD_DESCALE_P2_2]
|
||||||
psrad xmm6,DESCALE_P2_2
|
psrad xmm6, DESCALE_P2_2
|
||||||
|
|
||||||
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
|
packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
|
||||||
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
|
packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
|
||||||
paddb xmm6,[rel PB_CENTERJSAMP]
|
paddb xmm6, [rel PB_CENTERJSAMP]
|
||||||
|
|
||||||
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
|
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
|
||||||
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
|
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
|
||||||
|
|
||||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||||
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||||
|
|||||||
@@ -51,20 +51,20 @@ F_3_624 equ 29692 ; FIX(3.624509785)
|
|||||||
%else
|
%else
|
||||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||||
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
|
F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
|
||||||
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
|
F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
|
||||||
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
|
F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
|
||||||
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
|
F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
|
||||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
|
||||||
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
|
F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
|
||||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
|
||||||
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
|
F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
|
||||||
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
|
F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
|
||||||
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
|
F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
|
||||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
|
||||||
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
|
F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
|
||||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
|
||||||
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
|
F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; --------------------------------------------------------------------------
|
; --------------------------------------------------------------------------
|
||||||
@@ -116,11 +116,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
|
|
||||||
EXTN(jsimd_idct_4x4_sse2):
|
EXTN(jsimd_idct_4x4_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov eax,esp ; eax = original ebp
|
mov eax, esp ; eax = original ebp
|
||||||
sub esp, byte 4
|
sub esp, byte 4
|
||||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||||
mov [esp],eax
|
mov [esp], eax
|
||||||
mov ebp,esp ; ebp = aligned ebp
|
mov ebp, esp ; ebp = aligned ebp
|
||||||
lea esp, [wk(0)]
|
lea esp, [wk(0)]
|
||||||
pushpic ebx
|
pushpic ebx
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
@@ -147,11 +147,11 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||||
por xmm0,xmm1
|
por xmm0, xmm1
|
||||||
packsswb xmm0,xmm0
|
packsswb xmm0, xmm0
|
||||||
packsswb xmm0,xmm0
|
packsswb xmm0, xmm0
|
||||||
movd eax,xmm0
|
movd eax, xmm0
|
||||||
test eax,eax
|
test eax, eax
|
||||||
jnz short .columnDCT
|
jnz short .columnDCT
|
||||||
|
|
||||||
; -- AC terms all zero
|
; -- AC terms all zero
|
||||||
@@ -159,19 +159,19 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
psllw xmm0,PASS1_BITS
|
psllw xmm0, PASS1_BITS
|
||||||
|
|
||||||
movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||||
punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
|
punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
|
||||||
|
|
||||||
pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
|
pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
|
||||||
pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
|
pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
|
||||||
pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
|
pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
|
||||||
pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
|
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
|
||||||
|
|
||||||
jmp near .column_end
|
jmp near .column_end
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
%endif
|
%endif
|
||||||
.columnDCT:
|
.columnDCT:
|
||||||
|
|
||||||
@@ -186,32 +186,32 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
movdqa xmm4,xmm0
|
movdqa xmm4, xmm0
|
||||||
movdqa xmm5,xmm0
|
movdqa xmm5, xmm0
|
||||||
punpcklwd xmm4,xmm1
|
punpcklwd xmm4, xmm1
|
||||||
punpckhwd xmm5,xmm1
|
punpckhwd xmm5, xmm1
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
|
pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
|
pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
|
pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
|
||||||
|
|
||||||
movdqa xmm6,xmm2
|
movdqa xmm6, xmm2
|
||||||
movdqa xmm7,xmm2
|
movdqa xmm7, xmm2
|
||||||
punpcklwd xmm6,xmm3
|
punpcklwd xmm6, xmm3
|
||||||
punpckhwd xmm7,xmm3
|
punpckhwd xmm7, xmm3
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
|
pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
|
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
|
pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
|
pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
|
||||||
|
|
||||||
paddd xmm6,xmm4 ; xmm6=tmp2L
|
paddd xmm6, xmm4 ; xmm6=tmp2L
|
||||||
paddd xmm7,xmm5 ; xmm7=tmp2H
|
paddd xmm7, xmm5 ; xmm7=tmp2H
|
||||||
paddd xmm2,xmm0 ; xmm2=tmp0L
|
paddd xmm2, xmm0 ; xmm2=tmp0L
|
||||||
paddd xmm3,xmm1 ; xmm3=tmp0H
|
paddd xmm3, xmm1 ; xmm3=tmp0H
|
||||||
|
|
||||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
|
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
|
||||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
|
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
|
||||||
@@ -225,86 +225,86 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||||
|
|
||||||
pxor xmm1,xmm1
|
pxor xmm1, xmm1
|
||||||
pxor xmm2,xmm2
|
pxor xmm2, xmm2
|
||||||
punpcklwd xmm1,xmm4 ; xmm1=tmp0L
|
punpcklwd xmm1, xmm4 ; xmm1=tmp0L
|
||||||
punpckhwd xmm2,xmm4 ; xmm2=tmp0H
|
punpckhwd xmm2, xmm4 ; xmm2=tmp0H
|
||||||
psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
|
psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
|
||||||
psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
|
psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
|
||||||
|
|
||||||
movdqa xmm3,xmm5 ; xmm5=in2=z2
|
movdqa xmm3, xmm5 ; xmm5=in2=z2
|
||||||
punpcklwd xmm5,xmm0 ; xmm0=in6=z3
|
punpcklwd xmm5, xmm0 ; xmm0=in6=z3
|
||||||
punpckhwd xmm3,xmm0
|
punpckhwd xmm3, xmm0
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
|
||||||
pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
|
pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
|
||||||
|
|
||||||
movdqa xmm4,xmm1
|
movdqa xmm4, xmm1
|
||||||
movdqa xmm0,xmm2
|
movdqa xmm0, xmm2
|
||||||
paddd xmm1,xmm5 ; xmm1=tmp10L
|
paddd xmm1, xmm5 ; xmm1=tmp10L
|
||||||
paddd xmm2,xmm3 ; xmm2=tmp10H
|
paddd xmm2, xmm3 ; xmm2=tmp10H
|
||||||
psubd xmm4,xmm5 ; xmm4=tmp12L
|
psubd xmm4, xmm5 ; xmm4=tmp12L
|
||||||
psubd xmm0,xmm3 ; xmm0=tmp12H
|
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
movdqa xmm3,xmm2
|
movdqa xmm3, xmm2
|
||||||
paddd xmm1,xmm6 ; xmm1=data0L
|
paddd xmm1, xmm6 ; xmm1=data0L
|
||||||
paddd xmm2,xmm7 ; xmm2=data0H
|
paddd xmm2, xmm7 ; xmm2=data0H
|
||||||
psubd xmm5,xmm6 ; xmm5=data3L
|
psubd xmm5, xmm6 ; xmm5=data3L
|
||||||
psubd xmm3,xmm7 ; xmm3=data3H
|
psubd xmm3, xmm7 ; xmm3=data3H
|
||||||
|
|
||||||
movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
|
movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
|
||||||
|
|
||||||
paddd xmm1,xmm6
|
paddd xmm1, xmm6
|
||||||
paddd xmm2,xmm6
|
paddd xmm2, xmm6
|
||||||
psrad xmm1,DESCALE_P1_4
|
psrad xmm1, DESCALE_P1_4
|
||||||
psrad xmm2,DESCALE_P1_4
|
psrad xmm2, DESCALE_P1_4
|
||||||
paddd xmm5,xmm6
|
paddd xmm5, xmm6
|
||||||
paddd xmm3,xmm6
|
paddd xmm3, xmm6
|
||||||
psrad xmm5,DESCALE_P1_4
|
psrad xmm5, DESCALE_P1_4
|
||||||
psrad xmm3,DESCALE_P1_4
|
psrad xmm3, DESCALE_P1_4
|
||||||
|
|
||||||
packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
|
packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
|
||||||
packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
|
packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
|
||||||
|
|
||||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
|
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
|
||||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
|
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
|
||||||
|
|
||||||
movdqa xmm2,xmm4
|
movdqa xmm2, xmm4
|
||||||
movdqa xmm3,xmm0
|
movdqa xmm3, xmm0
|
||||||
paddd xmm4,xmm7 ; xmm4=data1L
|
paddd xmm4, xmm7 ; xmm4=data1L
|
||||||
paddd xmm0,xmm6 ; xmm0=data1H
|
paddd xmm0, xmm6 ; xmm0=data1H
|
||||||
psubd xmm2,xmm7 ; xmm2=data2L
|
psubd xmm2, xmm7 ; xmm2=data2L
|
||||||
psubd xmm3,xmm6 ; xmm3=data2H
|
psubd xmm3, xmm6 ; xmm3=data2H
|
||||||
|
|
||||||
movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
|
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
|
||||||
|
|
||||||
paddd xmm4,xmm7
|
paddd xmm4, xmm7
|
||||||
paddd xmm0,xmm7
|
paddd xmm0, xmm7
|
||||||
psrad xmm4,DESCALE_P1_4
|
psrad xmm4, DESCALE_P1_4
|
||||||
psrad xmm0,DESCALE_P1_4
|
psrad xmm0, DESCALE_P1_4
|
||||||
paddd xmm2,xmm7
|
paddd xmm2, xmm7
|
||||||
paddd xmm3,xmm7
|
paddd xmm3, xmm7
|
||||||
psrad xmm2,DESCALE_P1_4
|
psrad xmm2, DESCALE_P1_4
|
||||||
psrad xmm3,DESCALE_P1_4
|
psrad xmm3, DESCALE_P1_4
|
||||||
|
|
||||||
packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||||
packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
|
packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
|
||||||
|
|
||||||
movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
|
movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
|
punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
|
||||||
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||||
movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
|
movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
|
punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||||
punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
|
punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
|
||||||
|
|
||||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
|
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
|
punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
|
||||||
punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
|
punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
|
||||||
movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
|
movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
|
punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
|
||||||
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
|
punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
|
||||||
.column_end:
|
.column_end:
|
||||||
|
|
||||||
; -- Prefetch the next coefficient block
|
; -- Prefetch the next coefficient block
|
||||||
@@ -322,70 +322,70 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
pxor xmm4,xmm4
|
pxor xmm4, xmm4
|
||||||
punpcklwd xmm4,xmm1 ; xmm4=tmp0
|
punpcklwd xmm4, xmm1 ; xmm4=tmp0
|
||||||
psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
|
psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
|
||||||
|
|
||||||
; -- Odd part
|
; -- Odd part
|
||||||
|
|
||||||
punpckhwd xmm1,xmm0
|
punpckhwd xmm1, xmm0
|
||||||
punpckhwd xmm6,xmm3
|
punpckhwd xmm6, xmm3
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
|
pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
|
||||||
pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
|
pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
|
pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
|
||||||
|
|
||||||
paddd xmm6,xmm1 ; xmm6=tmp2
|
paddd xmm6, xmm1 ; xmm6=tmp2
|
||||||
paddd xmm2,xmm5 ; xmm2=tmp0
|
paddd xmm2, xmm5 ; xmm2=tmp0
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
punpcklwd xmm0,xmm3
|
punpcklwd xmm0, xmm3
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
|
pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
|
||||||
|
|
||||||
movdqa xmm7,xmm4
|
movdqa xmm7, xmm4
|
||||||
paddd xmm4,xmm0 ; xmm4=tmp10
|
paddd xmm4, xmm0 ; xmm4=tmp10
|
||||||
psubd xmm7,xmm0 ; xmm7=tmp12
|
psubd xmm7, xmm0 ; xmm7=tmp12
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
|
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
|
||||||
|
|
||||||
movdqa xmm5,xmm4
|
movdqa xmm5, xmm4
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
|
paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
|
||||||
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
|
paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
|
||||||
psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
|
psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
|
||||||
psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
|
psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
|
||||||
|
|
||||||
paddd xmm4,xmm1
|
paddd xmm4, xmm1
|
||||||
paddd xmm7,xmm1
|
paddd xmm7, xmm1
|
||||||
psrad xmm4,DESCALE_P2_4
|
psrad xmm4, DESCALE_P2_4
|
||||||
psrad xmm7,DESCALE_P2_4
|
psrad xmm7, DESCALE_P2_4
|
||||||
paddd xmm5,xmm1
|
paddd xmm5, xmm1
|
||||||
paddd xmm3,xmm1
|
paddd xmm3, xmm1
|
||||||
psrad xmm5,DESCALE_P2_4
|
psrad xmm5, DESCALE_P2_4
|
||||||
psrad xmm3,DESCALE_P2_4
|
psrad xmm3, DESCALE_P2_4
|
||||||
|
|
||||||
packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
|
packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
|
||||||
packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
|
packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
|
||||||
|
|
||||||
movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
|
movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
|
||||||
punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
|
punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
|
||||||
punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
|
punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||||
|
|
||||||
movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
|
movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
|
||||||
punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
|
punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
|
||||||
punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
|
punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
|
||||||
|
|
||||||
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
|
packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
|
||||||
paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
|
paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||||
|
|
||||||
pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
|
pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
|
||||||
pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
|
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
|
||||||
pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
|
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
|
||||||
|
|
||||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||||
@@ -401,7 +401,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
; pop edx ; need not be preserved
|
; pop edx ; need not be preserved
|
||||||
; pop ecx ; unused
|
; pop ecx ; unused
|
||||||
poppic ebx
|
poppic ebx
|
||||||
mov esp,ebp ; esp <- aligned ebp
|
mov esp, ebp ; esp <- aligned ebp
|
||||||
pop esp ; esp <- original ebp
|
pop esp ; esp <- original ebp
|
||||||
pop ebp
|
pop ebp
|
||||||
ret
|
ret
|
||||||
@@ -427,7 +427,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_idct_2x2_sse2):
|
EXTN(jsimd_idct_2x2_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
push ebx
|
push ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -465,27 +465,27 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
|
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
|
||||||
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
|
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
|
||||||
|
|
||||||
pcmpeqd xmm7,xmm7
|
pcmpeqd xmm7, xmm7
|
||||||
pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
|
pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
|
||||||
|
|
||||||
movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
|
movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
|
||||||
movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
|
movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
|
||||||
punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
|
punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
|
||||||
punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
|
punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
|
||||||
pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)]
|
pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
|
||||||
pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)]
|
pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
|
||||||
|
|
||||||
psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
|
psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
|
||||||
pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
|
pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
|
||||||
psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
|
psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
|
||||||
pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
|
pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
|
||||||
por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
|
por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
|
||||||
por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
|
por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
|
||||||
pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
|
pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
|
||||||
pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
|
pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
|
||||||
|
|
||||||
paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
|
paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
|
||||||
paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
|
paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
@@ -494,36 +494,36 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
|
|
||||||
; xmm6=(00 01 ** 03 ** 05 ** 07)
|
; xmm6=(00 01 ** 03 ** 05 ** 07)
|
||||||
|
|
||||||
movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
|
movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
|
||||||
pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
|
pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
|
||||||
pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
|
pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
|
||||||
psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
|
psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
|
||||||
psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
|
psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm3,xmm6
|
movdqa xmm3, xmm6
|
||||||
movdqa xmm5,xmm1
|
movdqa xmm5, xmm1
|
||||||
paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
|
paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
|
||||||
paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
|
paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
|
||||||
psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
|
psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
|
||||||
psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
|
psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
|
||||||
|
|
||||||
movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
|
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
|
||||||
|
|
||||||
punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
|
punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
|
||||||
|
|
||||||
movdqa xmm7,xmm1
|
movdqa xmm7, xmm1
|
||||||
punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
|
punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
|
||||||
punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
|
punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
|
||||||
|
|
||||||
paddd xmm6,xmm2
|
paddd xmm6, xmm2
|
||||||
psrad xmm6,DESCALE_P1_2
|
psrad xmm6, DESCALE_P1_2
|
||||||
|
|
||||||
paddd xmm1,xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm7,xmm2
|
paddd xmm7, xmm2
|
||||||
psrad xmm1,DESCALE_P1_2
|
psrad xmm1, DESCALE_P1_2
|
||||||
psrad xmm7,DESCALE_P1_2
|
psrad xmm7, DESCALE_P1_2
|
||||||
|
|
||||||
; -- Prefetch the next coefficient block
|
; -- Prefetch the next coefficient block
|
||||||
|
|
||||||
@@ -546,34 +546,34 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
|
|
||||||
; -- Odd part
|
; -- Odd part
|
||||||
|
|
||||||
packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
|
packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
|
||||||
packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
|
packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
|
||||||
pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)]
|
pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
|
||||||
pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)]
|
pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
|
||||||
|
|
||||||
paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
|
paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
|
||||||
|
|
||||||
; -- Even part
|
; -- Even part
|
||||||
|
|
||||||
pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
|
pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
|
||||||
|
|
||||||
; -- Final output stage
|
; -- Final output stage
|
||||||
|
|
||||||
movdqa xmm4,xmm6
|
movdqa xmm4, xmm6
|
||||||
paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
|
paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
|
||||||
psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
|
psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
|
||||||
|
|
||||||
punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
|
punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
|
||||||
|
|
||||||
paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
|
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
|
||||||
psrad xmm6,DESCALE_P2_2
|
psrad xmm6, DESCALE_P2_2
|
||||||
|
|
||||||
packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
|
packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
|
||||||
packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
|
packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
|
||||||
paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
|
paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||||
|
|
||||||
pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
|
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
|
||||||
pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
|
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
|
||||||
|
|
||||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||||
|
|||||||
@@ -39,14 +39,14 @@
|
|||||||
|
|
||||||
EXTN(jsimd_convsamp_float_sse2):
|
EXTN(jsimd_convsamp_float_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
pcmpeqw xmm7,xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
psllw xmm7,7
|
psllw xmm7, 7
|
||||||
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
|
packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
|
||||||
|
|
||||||
mov rsi, r10
|
mov rsi, r10
|
||||||
mov eax, r11d
|
mov eax, r11d
|
||||||
@@ -59,25 +59,25 @@ EXTN(jsimd_convsamp_float_sse2):
|
|||||||
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
|
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
|
||||||
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
|
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
|
||||||
|
|
||||||
psubb xmm0,xmm7 ; xmm0=(01234567)
|
psubb xmm0, xmm7 ; xmm0=(01234567)
|
||||||
psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
|
psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
|
||||||
|
|
||||||
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
|
punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
|
||||||
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
|
punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
|
||||||
|
|
||||||
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
|
punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
|
||||||
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
|
punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
|
||||||
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
|
punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
|
||||||
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
|
punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
|
||||||
|
|
||||||
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
|
psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
|
||||||
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
|
psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
|
||||||
cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
|
cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
|
||||||
cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
|
cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
|
||||||
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
|
psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
|
||||||
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
|
psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
|
||||||
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
|
cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
|
||||||
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
|
cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
|
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
@@ -113,8 +113,8 @@ EXTN(jsimd_convsamp_float_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_quantize_float_sse2):
|
EXTN(jsimd_quantize_float_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
mov rsi, r12
|
mov rsi, r12
|
||||||
@@ -131,13 +131,13 @@ EXTN(jsimd_quantize_float_sse2):
|
|||||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||||
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
|
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||||
|
|
||||||
cvtps2dq xmm0,xmm0
|
cvtps2dq xmm0, xmm0
|
||||||
cvtps2dq xmm1,xmm1
|
cvtps2dq xmm1, xmm1
|
||||||
cvtps2dq xmm2,xmm2
|
cvtps2dq xmm2, xmm2
|
||||||
cvtps2dq xmm3,xmm3
|
cvtps2dq xmm3, xmm3
|
||||||
|
|
||||||
packssdw xmm0,xmm1
|
packssdw xmm0, xmm1
|
||||||
packssdw xmm2,xmm3
|
packssdw xmm2, xmm3
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
|
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
|
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
|
||||||
|
|||||||
@@ -38,22 +38,22 @@
|
|||||||
|
|
||||||
EXTN(jsimd_convsamp_float_sse2):
|
EXTN(jsimd_convsamp_float_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
push ebx
|
push ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
|
|
||||||
pcmpeqw xmm7,xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
psllw xmm7,7
|
psllw xmm7, 7
|
||||||
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
|
packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||||
mov eax, JDIMENSION [start_col]
|
mov eax, JDIMENSION [start_col]
|
||||||
mov edi, POINTER [workspace] ; (DCTELEM *)
|
mov edi, POINTER [workspace] ; (DCTELEM *)
|
||||||
mov ecx, DCTSIZE/2
|
mov ecx, DCTSIZE/2
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.convloop:
|
.convloop:
|
||||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||||
@@ -61,25 +61,25 @@ EXTN(jsimd_convsamp_float_sse2):
|
|||||||
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||||
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||||
|
|
||||||
psubb xmm0,xmm7 ; xmm0=(01234567)
|
psubb xmm0, xmm7 ; xmm0=(01234567)
|
||||||
psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
|
psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
|
||||||
|
|
||||||
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
|
punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
|
||||||
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
|
punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
|
||||||
|
|
||||||
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
|
punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
|
||||||
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
|
punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
|
||||||
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
|
punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
|
||||||
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
|
punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
|
||||||
|
|
||||||
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
|
psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
|
||||||
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
|
psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
|
||||||
cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
|
cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
|
||||||
cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
|
cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
|
||||||
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
|
psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
|
||||||
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
|
psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
|
||||||
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
|
cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
|
||||||
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
|
cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
|
||||||
|
|
||||||
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
|
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||||
@@ -118,7 +118,7 @@ EXTN(jsimd_convsamp_float_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_quantize_float_sse2):
|
EXTN(jsimd_quantize_float_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
; push ebx ; unused
|
; push ebx ; unused
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -129,7 +129,7 @@ EXTN(jsimd_quantize_float_sse2):
|
|||||||
mov edx, POINTER [divisors]
|
mov edx, POINTER [divisors]
|
||||||
mov edi, JCOEFPTR [coef_block]
|
mov edi, JCOEFPTR [coef_block]
|
||||||
mov eax, DCTSIZE2/16
|
mov eax, DCTSIZE2/16
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.quantloop:
|
.quantloop:
|
||||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||||
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
|
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
|
||||||
@@ -140,13 +140,13 @@ EXTN(jsimd_quantize_float_sse2):
|
|||||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
||||||
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
|
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
|
||||||
|
|
||||||
cvtps2dq xmm0,xmm0
|
cvtps2dq xmm0, xmm0
|
||||||
cvtps2dq xmm1,xmm1
|
cvtps2dq xmm1, xmm1
|
||||||
cvtps2dq xmm2,xmm2
|
cvtps2dq xmm2, xmm2
|
||||||
cvtps2dq xmm3,xmm3
|
cvtps2dq xmm3, xmm3
|
||||||
|
|
||||||
packssdw xmm0,xmm1
|
packssdw xmm0, xmm1
|
||||||
packssdw xmm2,xmm3
|
packssdw xmm2, xmm3
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
|
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
|
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
|
||||||
|
|||||||
@@ -39,14 +39,14 @@
|
|||||||
|
|
||||||
EXTN(jsimd_convsamp_sse2):
|
EXTN(jsimd_convsamp_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
pxor xmm6,xmm6 ; xmm6=(all 0's)
|
pxor xmm6, xmm6 ; xmm6=(all 0's)
|
||||||
pcmpeqw xmm7,xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||||
|
|
||||||
mov rsi, r10
|
mov rsi, r10
|
||||||
mov eax, r11d
|
mov eax, r11d
|
||||||
@@ -65,14 +65,14 @@ EXTN(jsimd_convsamp_sse2):
|
|||||||
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
|
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
|
||||||
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
|
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
|
||||||
|
|
||||||
punpcklbw xmm0,xmm6 ; xmm0=(01234567)
|
punpcklbw xmm0, xmm6 ; xmm0=(01234567)
|
||||||
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
|
punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
|
punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
|
||||||
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
|
punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
|
||||||
paddw xmm2,xmm7
|
paddw xmm2, xmm7
|
||||||
paddw xmm3,xmm7
|
paddw xmm3, xmm7
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
|
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
|
||||||
@@ -115,8 +115,8 @@ EXTN(jsimd_convsamp_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_quantize_sse2):
|
EXTN(jsimd_quantize_sse2):
|
||||||
push rbp
|
push rbp
|
||||||
mov rax,rsp
|
mov rax, rsp
|
||||||
mov rbp,rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args
|
||||||
|
|
||||||
mov rsi, r12
|
mov rsi, r12
|
||||||
@@ -128,22 +128,22 @@ EXTN(jsimd_quantize_sse2):
|
|||||||
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
|
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
|
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
|
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
psraw xmm4,(WORD_BIT-1)
|
psraw xmm4, (WORD_BIT-1)
|
||||||
psraw xmm5,(WORD_BIT-1)
|
psraw xmm5, (WORD_BIT-1)
|
||||||
psraw xmm6,(WORD_BIT-1)
|
psraw xmm6, (WORD_BIT-1)
|
||||||
psraw xmm7,(WORD_BIT-1)
|
psraw xmm7, (WORD_BIT-1)
|
||||||
pxor xmm0,xmm4
|
pxor xmm0, xmm4
|
||||||
pxor xmm1,xmm5
|
pxor xmm1, xmm5
|
||||||
pxor xmm2,xmm6
|
pxor xmm2, xmm6
|
||||||
pxor xmm3,xmm7
|
pxor xmm3, xmm7
|
||||||
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
|
psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
|
||||||
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
|
psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
|
||||||
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
|
psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
|
||||||
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
|
psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
|
||||||
|
|
||||||
paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
|
paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
|
||||||
paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
|
paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
|
||||||
@@ -158,14 +158,14 @@ EXTN(jsimd_quantize_sse2):
|
|||||||
pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
|
pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
|
||||||
pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
|
pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
|
||||||
|
|
||||||
pxor xmm0,xmm4
|
pxor xmm0, xmm4
|
||||||
pxor xmm1,xmm5
|
pxor xmm1, xmm5
|
||||||
pxor xmm2,xmm6
|
pxor xmm2, xmm6
|
||||||
pxor xmm3,xmm7
|
pxor xmm3, xmm7
|
||||||
psubw xmm0,xmm4
|
psubw xmm0, xmm4
|
||||||
psubw xmm1,xmm5
|
psubw xmm1, xmm5
|
||||||
psubw xmm2,xmm6
|
psubw xmm2, xmm6
|
||||||
psubw xmm3,xmm7
|
psubw xmm3, xmm7
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
|
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
|
||||||
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
|
movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
|
||||||
|
|||||||
@@ -38,22 +38,22 @@
|
|||||||
|
|
||||||
EXTN(jsimd_convsamp_sse2):
|
EXTN(jsimd_convsamp_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
push ebx
|
push ebx
|
||||||
; push ecx ; need not be preserved
|
; push ecx ; need not be preserved
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
|
|
||||||
pxor xmm6,xmm6 ; xmm6=(all 0's)
|
pxor xmm6, xmm6 ; xmm6=(all 0's)
|
||||||
pcmpeqw xmm7,xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||||
|
|
||||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||||
mov eax, JDIMENSION [start_col]
|
mov eax, JDIMENSION [start_col]
|
||||||
mov edi, POINTER [workspace] ; (DCTELEM *)
|
mov edi, POINTER [workspace] ; (DCTELEM *)
|
||||||
mov ecx, DCTSIZE/4
|
mov ecx, DCTSIZE/4
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.convloop:
|
.convloop:
|
||||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||||
@@ -67,14 +67,14 @@ EXTN(jsimd_convsamp_sse2):
|
|||||||
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
|
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
|
||||||
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
|
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
|
||||||
|
|
||||||
punpcklbw xmm0,xmm6 ; xmm0=(01234567)
|
punpcklbw xmm0, xmm6 ; xmm0=(01234567)
|
||||||
punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
|
punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
|
||||||
paddw xmm0,xmm7
|
paddw xmm0, xmm7
|
||||||
paddw xmm1,xmm7
|
paddw xmm1, xmm7
|
||||||
punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
|
punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
|
||||||
punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
|
punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
|
||||||
paddw xmm2,xmm7
|
paddw xmm2, xmm7
|
||||||
paddw xmm3,xmm7
|
paddw xmm3, xmm7
|
||||||
|
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
|
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
|
||||||
@@ -120,7 +120,7 @@ EXTN(jsimd_convsamp_sse2):
|
|||||||
|
|
||||||
EXTN(jsimd_quantize_sse2):
|
EXTN(jsimd_quantize_sse2):
|
||||||
push ebp
|
push ebp
|
||||||
mov ebp,esp
|
mov ebp, esp
|
||||||
; push ebx ; unused
|
; push ebx ; unused
|
||||||
; push ecx ; unused
|
; push ecx ; unused
|
||||||
; push edx ; need not be preserved
|
; push edx ; need not be preserved
|
||||||
@@ -131,28 +131,28 @@ EXTN(jsimd_quantize_sse2):
|
|||||||
mov edx, POINTER [divisors]
|
mov edx, POINTER [divisors]
|
||||||
mov edi, JCOEFPTR [coef_block]
|
mov edi, JCOEFPTR [coef_block]
|
||||||
mov eax, DCTSIZE2/32
|
mov eax, DCTSIZE2/32
|
||||||
alignx 16,7
|
alignx 16, 7
|
||||||
.quantloop:
|
.quantloop:
|
||||||
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
|
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
|
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
|
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
|
movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
|
||||||
movdqa xmm0,xmm4
|
movdqa xmm0, xmm4
|
||||||
movdqa xmm1,xmm5
|
movdqa xmm1, xmm5
|
||||||
movdqa xmm2,xmm6
|
movdqa xmm2, xmm6
|
||||||
movdqa xmm3,xmm7
|
movdqa xmm3, xmm7
|
||||||
psraw xmm4,(WORD_BIT-1)
|
psraw xmm4, (WORD_BIT-1)
|
||||||
psraw xmm5,(WORD_BIT-1)
|
psraw xmm5, (WORD_BIT-1)
|
||||||
psraw xmm6,(WORD_BIT-1)
|
psraw xmm6, (WORD_BIT-1)
|
||||||
psraw xmm7,(WORD_BIT-1)
|
psraw xmm7, (WORD_BIT-1)
|
||||||
pxor xmm0,xmm4
|
pxor xmm0, xmm4
|
||||||
pxor xmm1,xmm5
|
pxor xmm1, xmm5
|
||||||
pxor xmm2,xmm6
|
pxor xmm2, xmm6
|
||||||
pxor xmm3,xmm7
|
pxor xmm3, xmm7
|
||||||
psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
|
psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
|
||||||
psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
|
psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
|
||||||
psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
|
psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
|
||||||
psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
|
psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
|
||||||
|
|
||||||
paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
|
paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
|
||||||
paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
|
paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
|
||||||
@@ -167,14 +167,14 @@ EXTN(jsimd_quantize_sse2):
|
|||||||
pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
|
pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
|
||||||
pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
|
pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
|
||||||
|
|
||||||
pxor xmm0,xmm4
|
pxor xmm0, xmm4
|
||||||
pxor xmm1,xmm5
|
pxor xmm1, xmm5
|
||||||
pxor xmm2,xmm6
|
pxor xmm2, xmm6
|
||||||
pxor xmm3,xmm7
|
pxor xmm3, xmm7
|
||||||
psubw xmm0,xmm4
|
psubw xmm0, xmm4
|
||||||
psubw xmm1,xmm5
|
psubw xmm1, xmm5
|
||||||
psubw xmm2,xmm6
|
psubw xmm2, xmm6
|
||||||
psubw xmm3,xmm7
|
psubw xmm3, xmm7
|
||||||
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
|
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
|
||||||
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
|
movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
|
||||||
|
|||||||
@@ -190,7 +190,7 @@ section .note.GNU-stack noalloc noexec nowrite progbits
|
|||||||
|
|
||||||
%ifdef PIC ; -------------------------------------------
|
%ifdef PIC ; -------------------------------------------
|
||||||
|
|
||||||
%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
|
%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
|
||||||
|
|
||||||
; At present, nasm doesn't seem to support PIC generation for Mach-O.
|
; At present, nasm doesn't seem to support PIC generation for Mach-O.
|
||||||
; The PIC support code below is a little tricky.
|
; The PIC support code below is a little tricky.
|
||||||
@@ -210,19 +210,20 @@ const_base:
|
|||||||
ret
|
ret
|
||||||
%%adjust:
|
%%adjust:
|
||||||
push ebp
|
push ebp
|
||||||
xor ebp,ebp ; ebp = 0
|
xor ebp, ebp ; ebp = 0
|
||||||
%ifidni %1,ebx ; (%1 == ebx)
|
%ifidni %1, ebx ; (%1 == ebx)
|
||||||
; db 0x8D,0x9C + jmp near const_base =
|
; db 0x8D,0x9C + jmp near const_base =
|
||||||
; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
|
; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
|
||||||
db 0x8D,0x9C ; 8D,9C
|
db 0x8D, 0x9C ; 8D,9C
|
||||||
jmp near const_base ; E9,(const_base-%%ref)
|
jmp near const_base ; E9,(const_base-%%ref)
|
||||||
%%ref:
|
%%ref:
|
||||||
%else ; (%1 != ebx)
|
%else ; (%1 != ebx)
|
||||||
; db 0x8D,0x8C + jmp near const_base =
|
; db 0x8D,0x8C + jmp near const_base =
|
||||||
; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
|
; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
|
||||||
db 0x8D,0x8C ; 8D,8C
|
db 0x8D, 0x8C ; 8D,8C
|
||||||
jmp near const_base ; E9,(const_base-%%ref)
|
jmp near const_base ; E9,(const_base-%%ref)
|
||||||
%%ref: mov %1, ecx
|
%%ref:
|
||||||
|
mov %1, ecx
|
||||||
%endif ; (%1 == ebx)
|
%endif ; (%1 == ebx)
|
||||||
pop ebp
|
pop ebp
|
||||||
%endmacro
|
%endmacro
|
||||||
@@ -251,7 +252,7 @@ const_base:
|
|||||||
pop %1
|
pop %1
|
||||||
%endmacro
|
%endmacro
|
||||||
%imacro movpic 2.nolist
|
%imacro movpic 2.nolist
|
||||||
mov %1,%2
|
mov %1, %2
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%else ; !PIC -----------------------------------------
|
%else ; !PIC -----------------------------------------
|
||||||
@@ -277,7 +278,8 @@ const_base:
|
|||||||
%define FILLB(b,n) (($$-(b)) & ((n)-1))
|
%define FILLB(b,n) (($$-(b)) & ((n)-1))
|
||||||
|
|
||||||
%imacro alignx 1-2.nolist 0xFFFF
|
%imacro alignx 1-2.nolist 0xFFFF
|
||||||
%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
|
%%bs: \
|
||||||
|
times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
|
||||||
db 0x90 ; nop
|
db 0x90 ; nop
|
||||||
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
|
times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
|
||||||
db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
|
db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
|
||||||
|
|||||||
Reference in New Issue
Block a user