Fix x86-64 ABI conformance issue in SIMD code

(descriptions cribbed by DRC from discussion in #20)
In the x86-64 ABI, the high (unused) DWORD of a 32-bit argument's
register is undefined, so it was incorrect to use a 64-bit mov
instruction to transfer a JDIMENSION argument in the 64-bit SSE2 SIMD
functions.  The code worked thus far only because the existing compiler
optimizers weren't smart enough to do anything else with the register in
question, so the upper 32 bits happened to be all zeroes-- for the past
6 years, on every x86-64 compiler previously known to mankind.

The bleeding-edge Clang/LLVM compiler has a smarter optimizer, and
under certain circumstances, it will attempt to load-combine adjacent
32-bit integers from one of the libjpeg structures into a single 64-bit
integer and pass that 64-bit integer as a 32-bit argument to one of the
SIMD functions (which is allowed by the ABI, since the upper 32 bits of
the 32-bit argument's register are undefined.)  This caused the
libjpeg-turbo regression tests to crash.

Also enhance the documentation of JDIMENSION to explain that its size
is significant to the implementation of the SIMD code.

Closes #20.  Refer also to http://crbug.com/532214.
This commit is contained in:
Chandler Carruth
2015-09-15 11:57:03 -07:00
committed by DRC
parent 465a9fe0cb
commit 498d9bc92f
14 changed files with 40 additions and 28 deletions

View File

@@ -11,6 +11,16 @@ incorrectly encode certain JPEG images when quality=100 and the fast integer
forward DCT were used. This was known to cause 'make test' to fail when the forward DCT were used. This was known to cause 'make test' to fail when the
library was built with '-march=haswell' on x86 systems. library was built with '-march=haswell' on x86 systems.
[3] Fixed an issue whereby libjpeg-turbo would crash when built with the latest
& greatest development version of the Clang/LLVM compiler. This was caused by
an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
routines. Those routines were incorrectly using a 64-bit mov instruction to
transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
(unused) 32 bits of a 32-bit argument's register to be undefined. The new
Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
structure members into a single 64-bit register, and this exposed the ABI
conformance issue.
1.4.1 1.4.1
===== =====

View File

@@ -162,7 +162,9 @@ typedef long INT32;
* images up to 64K*64K due to 16-bit fields in SOF markers. Therefore * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
* "unsigned int" is sufficient on all machines. However, if you need to * "unsigned int" is sufficient on all machines. However, if you need to
* handle larger images and you don't mind deviating from the spec, you * handle larger images and you don't mind deviating from the spec, you
* can change this datatype. * can change this datatype. (Note that changing this datatype will
* potentially require modifying the SIMD code. The x86-64 SIMD extensions,
* in particular, assume a 32-bit JDIMENSION.)
*/ */
typedef unsigned int JDIMENSION; typedef unsigned int JDIMENSION;

View File

@@ -50,14 +50,14 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
collect_args collect_args
push rbx push rbx
mov rcx, r10 mov ecx, r10d
test rcx,rcx test rcx,rcx
jz near .return jz near .return
push rcx push rcx
mov rsi, r12 mov rsi, r12
mov rcx, r13 mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]

View File

@@ -50,14 +50,14 @@ EXTN(jsimd_rgb_gray_convert_sse2):
collect_args collect_args
push rbx push rbx
mov rcx, r10 mov ecx, r10d
test rcx,rcx test rcx,rcx
jz near .return jz near .return
push rcx push rcx
mov rsi, r12 mov rsi, r12
mov rcx, r13 mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]

View File

@@ -49,11 +49,11 @@ EXTN(jsimd_h2v1_downsample_sse2):
mov rbp,rsp mov rbp,rsp
collect_args collect_args
mov rcx, r13 mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return jz near .return
mov rdx, r10 mov edx, r10d
; -- expand_right_edge ; -- expand_right_edge
@@ -90,7 +90,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
; -- h2v1_downsample ; -- h2v1_downsample
mov rax, r12 ; rowctr mov eax, r12d ; rowctr
test eax,eax test eax,eax
jle near .return jle near .return
@@ -193,11 +193,11 @@ EXTN(jsimd_h2v2_downsample_sse2):
mov rbp,rsp mov rbp,rsp
collect_args collect_args
mov rcx, r13 mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return jz near .return
mov rdx, r10 mov edx, r10d
; -- expand_right_edge ; -- expand_right_edge
@@ -234,7 +234,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
; -- h2v2_downsample ; -- h2v2_downsample
mov rax, r12 ; rowctr mov eax, r12d ; rowctr
test rax,rax test rax,rax
jle near .return jle near .return

View File

@@ -52,14 +52,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
collect_args collect_args
push rbx push rbx
mov rcx, r10 ; num_cols mov ecx, r10d ; num_cols
test rcx,rcx test rcx,rcx
jz near .return jz near .return
push rcx push rcx
mov rdi, r11 mov rdi, r11
mov rcx, r12 mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]

View File

@@ -52,14 +52,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
collect_args collect_args
push rbx push rbx
mov rcx, r10 ; col mov ecx, r10d ; col
test rcx,rcx test rcx,rcx
jz near .return jz near .return
push rcx push rcx
mov rdi, r11 mov rdi, r11
mov rcx, r12 mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
@@ -455,10 +455,10 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
collect_args collect_args
push rbx push rbx
mov rax, r10 mov eax, r10d
mov rdi, r11 mov rdi, r11
mov rcx, r12 mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]

View File

@@ -67,7 +67,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
mov rbp,rsp mov rbp,rsp
collect_args collect_args
mov rax, r11 ; colctr mov eax, r11d ; colctr
test rax,rax test rax,rax
jz near .return jz near .return
@@ -214,7 +214,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
collect_args collect_args
push rbx push rbx
mov rax, r11 ; colctr mov eax, r11d ; colctr
test rax,rax test rax,rax
jz near .return jz near .return
@@ -506,7 +506,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
mov rbp,rsp mov rbp,rsp
collect_args collect_args
mov rdx, r11 mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1 add rdx, byte (2*SIZEOF_XMMWORD)-1
and rdx, byte -(2*SIZEOF_XMMWORD) and rdx, byte -(2*SIZEOF_XMMWORD)
jz near .return jz near .return
@@ -596,7 +596,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
collect_args collect_args
push rbx push rbx
mov rdx, r11 mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1 add rdx, byte (2*SIZEOF_XMMWORD)-1
and rdx, byte -(2*SIZEOF_XMMWORD) and rdx, byte -(2*SIZEOF_XMMWORD)
jz near .return jz near .return

View File

@@ -326,7 +326,7 @@ EXTN(jsimd_idct_float_sse2):
mov rax, [original_rbp] mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT * wsptr lea rsi, [workspace] ; FAST_FLOAT * wsptr
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov rax, r13 mov eax, r13d
mov rcx, DCTSIZE/4 ; ctr mov rcx, DCTSIZE/4 ; ctr
.rowloop: .rowloop:

View File

@@ -323,7 +323,7 @@ EXTN(jsimd_idct_ifast_sse2):
mov rax, [original_rbp] mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov rax, r13 mov eax, r13d
; -- Even part ; -- Even part

View File

@@ -515,7 +515,7 @@ EXTN(jsimd_idct_islow_sse2):
mov rax, [original_rbp] mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov rax, r13 mov eax, r13d
; -- Even part ; -- Even part

View File

@@ -312,7 +312,7 @@ EXTN(jsimd_idct_4x4_sse2):
mov rax, [original_rbp] mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov rax, r13 mov eax, r13d
; -- Even part ; -- Even part
@@ -521,7 +521,7 @@ EXTN(jsimd_idct_2x2_sse2):
; ---- Pass 2: process rows, store into output array. ; ---- Pass 2: process rows, store into output array.
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov rax, r13 mov eax, r13d
; | input:| result:| ; | input:| result:|
; | A0 B0 | | ; | A0 B0 | |

View File

@@ -50,7 +50,7 @@ EXTN(jsimd_convsamp_float_sse2):
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
mov rsi, r10 mov rsi, r10
mov rax, r11 mov eax, r11d
mov rdi, r12 mov rdi, r12
mov rcx, DCTSIZE/2 mov rcx, DCTSIZE/2
.convloop: .convloop:

View File

@@ -50,7 +50,7 @@ EXTN(jsimd_convsamp_sse2):
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
mov rsi, r10 mov rsi, r10
mov rax, r11 mov eax, r11d
mov rdi, r12 mov rdi, r12
mov rcx, DCTSIZE/4 mov rcx, DCTSIZE/4
.convloop: .convloop: