Fix x86-64 ABI conformance issue in SIMD code

(descriptions cribbed by DRC from discussion in #20) In the x86-64 ABI, the high (unused) DWORD of a 32-bit argument's register is undefined, so it was incorrect to use a 64-bit mov instruction to transfer a JDIMENSION argument in the 64-bit SSE2 SIMD functions. The code worked thus far only because the existing compiler optimizers weren't smart enough to do anything else with the register in question, so the upper 32 bits happened to be all zeroes-- for the past 6 years, on every x86-64 compiler previously known to mankind. The bleeding-edge Clang/LLVM compiler has a smarter optimizer, and under certain circumstances, it will attempt to load-combine adjacent 32-bit integers from one of the libjpeg structures into a single 64-bit integer and pass that 64-bit integer as a 32-bit argument to one of the SIMD functions (which is allowed by the ABI, since the upper 32 bits of the 32-bit argument's register are undefined.) This caused the libjpeg-turbo regression tests to crash. Also enhance the documentation of JDIMENSION to explain that its size is significant to the implementation of the SIMD code. Closes #20. Refer also to http://crbug.com/532214.
2015-09-15 11:57:03 -07:00
parent 465a9fe0cb
commit 498d9bc92f
14 changed files with 40 additions and 28 deletions
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -11,6 +11,16 @@ incorrectly encode certain JPEG images when quality=100 and the fast integer
 forward DCT were used.  This was known to cause 'make test' to fail when the
 library was built with '-march=haswell' on x86 systems.
 [3] Fixed an issue whereby libjpeg-turbo would crash when built with the latest
 & greatest development version of the Clang/LLVM compiler.  This was caused by
 an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
 routines.  Those routines were incorrectly using a 64-bit mov instruction to
 transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
 (unused) 32 bits of a 32-bit argument's register to be undefined.  The new
 Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
 structure members into a single 64-bit register, and this exposed the ABI
 conformance issue.
 1.4.1
 =====
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -162,7 +162,9 @@ typedef long INT32;
 * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
 * "unsigned int" is sufficient on all machines.  However, if you need to
 * handle larger images and you don't mind deviating from the spec, you
- * can change this datatype.
+ * can change this datatype.  (Note that changing this datatype will
 * potentially require modifying the SIMD code.  The x86-64 SIMD extensions,
 * in particular, assume a 32-bit JDIMENSION.)
 */
 typedef unsigned int JDIMENSION;
--- a/simd/jccolext-sse2-64.asm
+++ b/simd/jccolext-sse2-64.asm
@@ -50,14 +50,14 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
        collect_args
        push    rbx
-        mov     rcx, r10
+        mov     ecx, r10d
        test    rcx,rcx
        jz      near .return
        push    rcx
        mov rsi, r12
-        mov rcx, r13
+        mov ecx, r13d
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
--- a/simd/jcgryext-sse2-64.asm
+++ b/simd/jcgryext-sse2-64.asm
@@ -50,14 +50,14 @@ EXTN(jsimd_rgb_gray_convert_sse2):
        collect_args
        push    rbx
-        mov     rcx, r10
+        mov     ecx, r10d
        test    rcx,rcx
        jz      near .return
        push    rcx
        mov rsi, r12
-        mov rcx, r13
+        mov ecx, r13d
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
--- a/simd/jcsample-sse2-64.asm
+++ b/simd/jcsample-sse2-64.asm
@@ -49,11 +49,11 @@ EXTN(jsimd_h2v1_downsample_sse2):
        mov     rbp,rsp
        collect_args
-        mov rcx, r13
+        mov ecx, r13d
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return
-        mov rdx, r10
+        mov edx, r10d
        ; -- expand_right_edge
@@ -90,7 +90,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
        ; -- h2v1_downsample
-        mov     rax, r12        ; rowctr
+        mov     eax, r12d        ; rowctr
        test    eax,eax
        jle     near .return
@@ -193,11 +193,11 @@ EXTN(jsimd_h2v2_downsample_sse2):
        mov     rbp,rsp
        collect_args
-        mov     rcx, r13
+        mov     ecx, r13d
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return
-        mov     rdx, r10
+        mov     edx, r10d
        ; -- expand_right_edge
@@ -234,7 +234,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
        ; -- h2v2_downsample
-        mov     rax, r12        ; rowctr
+        mov     eax, r12d        ; rowctr
        test    rax,rax
        jle     near .return
--- a/simd/jdcolext-sse2-64.asm
+++ b/simd/jdcolext-sse2-64.asm
@@ -52,14 +52,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        collect_args
        push    rbx
-        mov     rcx, r10        ; num_cols
+        mov     ecx, r10d        ; num_cols
        test    rcx,rcx
        jz      near .return
        push    rcx
        mov     rdi, r11
-        mov     rcx, r12
+        mov     ecx, r12d
        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
--- a/simd/jdmrgext-sse2-64.asm
+++ b/simd/jdmrgext-sse2-64.asm
@@ -52,14 +52,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        collect_args
        push    rbx
-        mov     rcx, r10        ; col
+        mov     ecx, r10d        ; col
        test    rcx,rcx
        jz      near .return
        push    rcx
        mov     rdi, r11
-        mov     rcx, r12
+        mov     ecx, r12d
        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
@@ -455,10 +455,10 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
        collect_args
        push    rbx
-        mov     rax, r10
+        mov     eax, r10d
        mov     rdi, r11
-        mov     rcx, r12
+        mov     ecx, r12d
        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
--- a/simd/jdsample-sse2-64.asm
+++ b/simd/jdsample-sse2-64.asm
@@ -67,7 +67,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
        mov     rbp,rsp
        collect_args
-        mov     rax, r11  ; colctr
+        mov     eax, r11d  ; colctr
        test    rax,rax
        jz      near .return
@@ -214,7 +214,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
        collect_args
        push    rbx
-        mov     rax, r11  ; colctr
+        mov     eax, r11d  ; colctr
        test    rax,rax
        jz      near .return
@@ -506,7 +506,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
        mov     rbp,rsp
        collect_args
-        mov     rdx, r11
+        mov     edx, r11d
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)
        jz      near .return
@@ -596,7 +596,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
        collect_args
        push    rbx
-        mov     rdx, r11
+        mov     edx, r11d
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)
        jz      near .return
--- a/simd/jidctflt-sse2-64.asm
+++ b/simd/jidctflt-sse2-64.asm
@@ -326,7 +326,7 @@ EXTN(jsimd_idct_float_sse2):
        mov     rax, [original_rbp]
        lea     rsi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     rdi, r12        ; (JSAMPROW *)
-        mov     rax, r13
+        mov     eax, r13d
        mov     rcx, DCTSIZE/4                          ; ctr
 .rowloop:
--- a/simd/jidctfst-sse2-64.asm
+++ b/simd/jidctfst-sse2-64.asm
@@ -323,7 +323,7 @@ EXTN(jsimd_idct_ifast_sse2):
        mov     rax, [original_rbp]
        mov     rdi, r12        ; (JSAMPROW *)
-        mov     rax, r13
+        mov     eax, r13d
        ; -- Even part
--- a/simd/jidctint-sse2-64.asm
+++ b/simd/jidctint-sse2-64.asm
@@ -515,7 +515,7 @@ EXTN(jsimd_idct_islow_sse2):
        mov     rax, [original_rbp]
        mov     rdi, r12        ; (JSAMPROW *)
-        mov     rax, r13
+        mov     eax, r13d
        ; -- Even part
--- a/simd/jidctred-sse2-64.asm
+++ b/simd/jidctred-sse2-64.asm
@@ -312,7 +312,7 @@ EXTN(jsimd_idct_4x4_sse2):
        mov     rax, [original_rbp]
        mov     rdi, r12        ; (JSAMPROW *)
-        mov     rax, r13
+        mov     eax, r13d
        ; -- Even part
@@ -521,7 +521,7 @@ EXTN(jsimd_idct_2x2_sse2):
        ; ---- Pass 2: process rows, store into output array.
        mov     rdi, r12        ; (JSAMPROW *)
-        mov     rax, r13
+        mov     eax, r13d
        ; | input:| result:|
        ; | A0 B0 |        |
--- a/simd/jquantf-sse2-64.asm
+++ b/simd/jquantf-sse2-64.asm
@@ -50,7 +50,7 @@ EXTN(jsimd_convsamp_float_sse2):
        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
        mov rsi, r10
-        mov     rax, r11
+        mov     eax, r11d
        mov rdi, r12
        mov     rcx, DCTSIZE/2
 .convloop:
--- a/simd/jquanti-sse2-64.asm
+++ b/simd/jquanti-sse2-64.asm
@@ -50,7 +50,7 @@ EXTN(jsimd_convsamp_sse2):
        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
        mov rsi, r10
-        mov rax, r11
+        mov eax, r11d
        mov rdi, r12
        mov     rcx, DCTSIZE/4
 .convloop: