Fixed regression caused by a bug in the 32-bit strict memory access code in jdmrgss2.asm (contributed by Chromium to stop valgrind from whining whenever the output buffer size was not evenly divisible by 16 bytes). On Linux/x86, this regression caused incorrect pixels on the right-hand side of images whose rows were not 16-byte aligned, whenever fancy upsampling was not used. This patch also enables the strict memory access code on all platforms, not just Linux (it does no harm on other platforms) and removes a couple of pcmpeqb instructions that were rendered unnecessary by r836.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@839 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2012-06-15 21:58:06 +00:00
6 changed files with 13 additions and 352 deletions

View File

@@ -44,6 +44,12 @@ it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.
[10] Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms,
decompressing a 4:2:0 or 4:2:2 JPEG image without using fancy upsampling would
produce several incorrect columns of pixels at the right-hand side of the
output image if the size of each row in the output image was not evenly
divisible by 16 bytes.
1.2.0
=====

View File

@@ -1,7 +1,7 @@
;
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
@@ -267,7 +267,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
@@ -285,7 +284,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
@@ -319,47 +317,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
%else
mov rax,rcx
xor rcx, byte 0x0F
shl rcx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
add rax,rcx
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg ecx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,7 +378,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
@@ -438,7 +394,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
@@ -453,47 +408,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test rcx, rcx
jz short .nextrow
movd DWORD [rdi], xmmA
%else
cmp rcx, byte SIZEOF_XMMWORD/16
jb near .nextrow
mov rax,rcx
xor rcx, byte 0x03
inc rcx
shl rcx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg rcx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -1,7 +1,7 @@
;
; jdclrss2.asm - colorspace conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -279,7 +279,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
@@ -297,7 +296,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
@@ -331,47 +329,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
%else
mov eax,ecx
xor ecx, byte 0x0F
shl ecx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
add eax,ecx
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg ecx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -434,7 +391,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
@@ -451,7 +407,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
@@ -466,47 +421,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test ecx, ecx
jz short .nextrow
movd DWORD [edi], xmmA
%else
cmp ecx, byte SIZEOF_XMMWORD/16
jb short .nextrow
mov eax,ecx
xor ecx, byte 0x03
inc ecx
shl ecx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg ecx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -1,7 +1,7 @@
;
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
@@ -12,7 +12,7 @@
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ for
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
@@ -288,7 +288,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
@@ -322,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
%else
mov rax,rcx
xor rcx, byte 0x0F
shl rcx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
add rax,rcx
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg rcx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -427,7 +385,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
@@ -444,7 +401,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
@@ -459,47 +415,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test rcx, rcx
jz short .endcolumn
movd DWORD [rdi], xmmA
%else
cmp rcx, byte SIZEOF_XMMWORD/16
jb near .endcolumn
mov rax,rcx
xor rcx, byte 0x03
inc rcx
shl rcx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg rcx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -1,7 +1,7 @@
;
; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -284,7 +284,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
@@ -302,7 +301,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
@@ -336,47 +334,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
%else
mov eax,ecx
xor ecx, byte 0x0F
shl ecx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
add eax,ecx
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg ecx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -442,7 +399,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
@@ -459,62 +415,20 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/2
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, 64
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd DWORD [edi], xmmA
%else
cmp ecx, byte SIZEOF_XMMWORD/16
jb short .endcolumn
mov eax,ecx
xor ecx, byte 0x03
inc ecx
shl ecx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg ecx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@@ -86,8 +86,6 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
%define STRICT_MEMORY_ACCESS 1
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC