Eliminate the use of the MASKMOVDQU instruction, which speeds up decompression by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@835 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
@@ -19,6 +19,13 @@ calling conventions.
|
||||
images (specifically, images in which the component count was erroneously set
|
||||
to a large value) would cause libjpeg-turbo to segfault.
|
||||
|
||||
[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
|
||||
processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo
|
||||
SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
|
||||
it is painfully slow on Bobcat processors in particular. Eliminating the use
|
||||
of this instruction improved performance by an order of magnitude on Bobcat
|
||||
processors and by a small amount (typically 5%) on AMD desktop processors.
|
||||
|
||||
|
||||
1.2.0
|
||||
=====
|
||||
|
||||
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp rcx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub rcx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [rdi],xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp rcx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub rcx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [rdi],xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
Reference in New Issue
Block a user