Eliminate the use of the MASKMOVDQU instruction to speed up decompression by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@835 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2012-06-13 01:21:29 +00:00
parent dd2b651243
commit 69799275be
5 changed files with 83 additions and 121 deletions

View File

@@ -19,6 +19,13 @@ calling conventions.
images (specifically, images in which the component count was erroneously set
to a large value) would cause libjpeg-turbo to segfault.
[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo
SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.
1.2.0
=====

View File

@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------