Eliminate the use of the MASKMOVDQU instruction to speed up decompression by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@835 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
@@ -19,6 +19,13 @@ calling conventions.
|
|||||||
images (specifically, images in which the component count was erroneously set
|
images (specifically, images in which the component count was erroneously set
|
||||||
to a large value) would cause libjpeg-turbo to segfault.
|
to a large value) would cause libjpeg-turbo to segfault.
|
||||||
|
|
||||||
|
[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
|
||||||
|
processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo
|
||||||
|
SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
|
||||||
|
it is painfully slow on Bobcat processors in particular. Eliminating the use
|
||||||
|
of this instruction improved performance by an order of magnitude on Bobcat
|
||||||
|
processors and by a small amount (typically 5%) on AMD desktop processors.
|
||||||
|
|
||||||
|
|
||||||
1.2.0
|
1.2.0
|
||||||
=====
|
=====
|
||||||
|
|||||||
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
jz near .nextrow
|
jz near .nextrow
|
||||||
|
|
||||||
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||||
cmp rcx, byte 2*SIZEOF_XMMWORD
|
cmp rcx, byte 2*SIZEOF_XMMWORD
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA,xmmF
|
||||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
por xmmA,xmmG
|
por xmmA,xmmG
|
||||||
por xmmE,xmmC
|
por xmmE,xmmC
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi], xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||||
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
jz near .nextrow
|
jz near .nextrow
|
||||||
|
|
||||||
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||||
cmp rcx, byte SIZEOF_XMMWORD/2
|
cmp rcx, byte SIZEOF_XMMWORD/2
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA,xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD,xmmH
|
||||||
sub rcx, byte SIZEOF_XMMWORD/2
|
sub rcx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD/4
|
sub rcx, byte SIZEOF_XMMWORD/4
|
||||||
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
por xmmA,xmmB
|
por xmmA,xmmB
|
||||||
por xmmE,xmmG
|
por xmmE,xmmG
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi], xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|||||||
@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
jz near .nextrow
|
jz near .nextrow
|
||||||
|
|
||||||
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA,xmmF
|
||||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
por xmmA,xmmG
|
por xmmA,xmmG
|
||||||
por xmmE,xmmC
|
por xmmE,xmmC
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi], xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
jz near .nextrow
|
jz near .nextrow
|
||||||
|
|
||||||
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA,xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD,xmmH
|
||||||
sub ecx, byte SIZEOF_XMMWORD/2
|
sub ecx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD/4
|
sub ecx, byte SIZEOF_XMMWORD/4
|
||||||
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
por xmmA,xmmB
|
por xmmA,xmmB
|
||||||
por xmmE,xmmG
|
por xmmE,xmmG
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi], xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|||||||
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
jz near .endcolumn
|
jz near .endcolumn
|
||||||
|
|
||||||
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
jmp near .columnloop
|
jmp near .columnloop
|
||||||
|
|
||||||
.column_st32:
|
.column_st32:
|
||||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
|
||||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||||
cmp rcx, byte 2*SIZEOF_XMMWORD
|
cmp rcx, byte 2*SIZEOF_XMMWORD
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA,xmmF
|
||||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp rcx, byte SIZEOF_XMMWORD
|
cmp rcx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
por xmmA,xmmG
|
por xmmA,xmmG
|
||||||
por xmmE,xmmC
|
por xmmE,xmmC
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [rdi],xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||||
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub rcx, byte SIZEOF_XMMWORD
|
sub rcx, byte SIZEOF_XMMWORD
|
||||||
jz near .endcolumn
|
jz near .endcolumn
|
||||||
|
|
||||||
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||||
cmp rcx, byte SIZEOF_XMMWORD/2
|
cmp rcx, byte SIZEOF_XMMWORD/2
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA,xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD,xmmH
|
||||||
sub rcx, byte SIZEOF_XMMWORD/2
|
sub rcx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub rcx, byte SIZEOF_XMMWORD/4
|
sub rcx, byte SIZEOF_XMMWORD/4
|
||||||
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
por xmmA,xmmB
|
por xmmA,xmmB
|
||||||
por xmmE,xmmG
|
por xmmE,xmmG
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [rdi],xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|||||||
@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
jz near .endcolumn
|
jz near .endcolumn
|
||||||
|
|
||||||
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmF
|
movdqa xmmA,xmmF
|
||||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||||
jmp short .column_st15
|
jmp short .column_st15
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD
|
cmp ecx, byte SIZEOF_XMMWORD
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
por xmmA,xmmG
|
por xmmA,xmmG
|
||||||
por xmmE,xmmC
|
por xmmE,xmmC
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi], xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||||
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
||||||
jmp short .out0
|
jmp short .out0
|
||||||
.out1: ; --(unaligned)-----------------
|
.out1: ; --(unaligned)-----------------
|
||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
|
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
.out0:
|
.out0:
|
||||||
|
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||||
sub ecx, byte SIZEOF_XMMWORD
|
sub ecx, byte SIZEOF_XMMWORD
|
||||||
jz near .endcolumn
|
jz near .endcolumn
|
||||||
|
|
||||||
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||||
jb short .column_st16
|
jb short .column_st16
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
|
||||||
movdqa xmmA,xmmC
|
movdqa xmmA,xmmC
|
||||||
movdqa xmmD,xmmH
|
movdqa xmmD,xmmH
|
||||||
sub ecx, byte SIZEOF_XMMWORD/2
|
sub ecx, byte SIZEOF_XMMWORD/2
|
||||||
.column_st16:
|
.column_st16:
|
||||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||||
jb short .column_st15
|
jb short .column_st15
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||||
movdqa xmmA,xmmD
|
movdqa xmmA,xmmD
|
||||||
sub ecx, byte SIZEOF_XMMWORD/4
|
sub ecx, byte SIZEOF_XMMWORD/4
|
||||||
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
por xmmA,xmmB
|
por xmmA,xmmB
|
||||||
por xmmE,xmmG
|
por xmmE,xmmG
|
||||||
.adj0: ; ----------------
|
.adj0: ; ----------------
|
||||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
movdqu XMMWORD [edi], xmmA
|
||||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||||
|
|
||||||
%endif ; RGB_PIXELSIZE ; ---------------
|
%endif ; RGB_PIXELSIZE ; ---------------
|
||||||
|
|||||||
Reference in New Issue
Block a user