Eliminate the use of the MASKMOVDQU instruction to improve decompression performance by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@835 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2012-06-13 01:21:29 +00:00
parent dd2b651243
commit 69799275be
5 changed files with 83 additions and 121 deletions

View File

@@ -19,6 +19,13 @@ calling conventions.
images (specifically, images in which the component count was erroneously set
to a large value) would cause libjpeg-turbo to segfault.
[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo
SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.
1.2.0
=====

View File

@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's) movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
add rdi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD add rdi, byte 2*SIZEOF_XMMWORD ; outptr
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG por xmmA,xmmG
por xmmE,xmmC por xmmE,xmmC
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2 cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD add rdi, byte 2*SIZEOF_XMMWORD ; outptr
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA,xmmC
movdqa xmmD,xmmH movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2 sub rcx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4 cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB por xmmA,xmmB
por xmmE,xmmG por xmmE,xmmG
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's) movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
add edi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD add edi, byte 2*SIZEOF_XMMWORD ; outptr
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG por xmmA,xmmG
por xmmE,xmmC por xmmE,xmmC
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow jz near .nextrow
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD add edi, byte 2*SIZEOF_XMMWORD ; outptr
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA,xmmC
movdqa xmmD,xmmH movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2 sub ecx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4 cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB por xmmA,xmmB
por xmmE,xmmG por xmmE,xmmG
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's) movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
add rdi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop jmp near .columnloop
.column_st32: .column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD add rdi, byte 2*SIZEOF_XMMWORD ; outptr
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG por xmmA,xmmG
por xmmE,xmmC por xmmE,xmmC
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2 cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD add rdi, byte 2*SIZEOF_XMMWORD ; outptr
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA,xmmC
movdqa xmmD,xmmH movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2 sub rcx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4 cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4 sub rcx, byte SIZEOF_XMMWORD/4
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB por xmmA,xmmB
por xmmE,xmmG por xmmE,xmmG
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's) movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
add edi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD add edi, byte 2*SIZEOF_XMMWORD ; outptr
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15 jmp short .column_st15
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG por xmmA,xmmG
por xmmE,xmmC por xmmE,xmmC
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; ----------- %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0 jmp short .out0
.out1: ; --(unaligned)----------------- .out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
.out0: .out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn jz near .endcolumn
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's) pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16 jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD add edi, byte 2*SIZEOF_XMMWORD ; outptr
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC movdqa xmmA,xmmC
movdqa xmmD,xmmH movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2 sub ecx, byte SIZEOF_XMMWORD/2
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4 cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15 jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4 sub ecx, byte SIZEOF_XMMWORD/4
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB por xmmA,xmmB
por xmmE,xmmG por xmmE,xmmG
.adj0: ; ---------------- .adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------