ARM64 NEON: Fix another ABI conformance issue

Based on
98a5a9dc89
with wordsmithing by DRC.

In the AArch64 ABI, as in many others, it's forbidden to read/store data
below the stack pointer.  Some SIMD functions were doing just that
(stack pointer misuse) when trying to preserve callee-saved registers,
and this resulted in those registers being restored with incorrect
contents under certain circumstances.

This patch fixes that behavior, and callee-saved registers are now
stored above the stack pointer throughout the function call.  The patch
also removes register saving in places where it is unnecessary for this
ABI, or it makes use of unused scratch regiters instead of callee-saved
registers.

Fixes #97.  Closes #101.

Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569
This commit is contained in:
mayeut
2016-09-20 21:06:24 +02:00
committed by DRC
parent e9d9c31fd2
commit cb88e5da80
2 changed files with 34 additions and 80 deletions

View File

@@ -83,6 +83,12 @@ same structure, it was not known to pose a security threat, but removing the
warning makes it easier to detect actual security issues, should they arise in warning makes it easier to detect actual security issues, should they arise in
the future. the future.
10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
code. Some of the routines were incorrectly reading and storing data below the
stack pointer, which caused segfaults in certain applications under specific
circumstances.
1.5.0 1.5.0
===== =====

View File

@@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon
sub sp, sp, #64 sub sp, sp, #64
adr x15, Ljsimd_idct_islow_neon_consts adr x15, Ljsimd_idct_islow_neon_consts
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 mov x10, sp
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
ld1 {v0.8h, v1.8h}, [x15] ld1 {v0.8h, v1.8h}, [x15]
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
@@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon
shl v10.8h, v2.8h, #(PASS1_BITS) shl v10.8h, v2.8h, #(PASS1_BITS)
sqxtn v16.8b, v15.8h sqxtn v16.8b, v15.8h
mov TMP1, v16.d[0] mov TMP1, v16.d[0]
sub sp, sp, #64
mvn TMP2, TMP1 mvn TMP2, TMP1
cbnz TMP2, 2f cbnz TMP2, 2f
@@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon
uxtw x3, w3 uxtw x3, w3
/* Save all used NEON registers */ /* Save all used NEON registers */
sub sp, sp, 272 sub sp, sp, 64
str x15, [sp], 16 mov x9, sp
/* Load constants (v3.4h is just used for padding) */ /* Load constants (v3.4h is just used for padding) */
adr TMP4, Ljsimd_idct_4x4_neon_consts adr TMP4, Ljsimd_idct_4x4_neon_consts
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation: /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
#endif #endif
/* vpop {v8.4h - v15.4h} ;not available */ /* vpop {v8.4h - v15.4h} ;not available */
sub sp, sp, #272
ldr x15, [sp], 16
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
blr x30 blr x30
.unreq DCT_TABLE .unreq DCT_TABLE
@@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon
uxtw x3, w3 uxtw x3, w3
/* vpush {v8.4h - v15.4h} ; not available */ /* vpush {v8.4h - v15.4h} ; not available */
sub sp, sp, 208 sub sp, sp, 64
str x15, [sp], 16 mov x9, sp
/* Load constants */ /* Load constants */
adr TMP2, Ljsimd_idct_2x2_neon_consts adr TMP2, Ljsimd_idct_2x2_neon_consts
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
st1 {v21.8b, v22.8b}, [sp], 16
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
st1 {v30.8b, v31.8b}, [sp], 16
ld1 {v14.4h}, [TMP2] ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation: /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
st1 {v26.b}[1], [TMP2], 1 st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1 st1 {v27.b}[5], [TMP2], 1
sub sp, sp, #208
ldr x15, [sp], 16
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
ld1 {v21.8b, v22.8b}, [sp], 16
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
ld1 {v30.8b, v31.8b}, [sp], 16
blr x30 blr x30
.unreq DCT_TABLE .unreq DCT_TABLE
@@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
INPUT_BUF2 .req x1 INPUT_BUF2 .req x1
RGB .req x7 RGB .req x7
Y .req x8 Y .req x9
U .req x9 U .req x10
V .req x10 V .req x11
N .req w15 N .req w15
sub sp, sp, 336 sub sp, sp, 64
str x15, [sp], 16 mov x9, sp
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
.if \fast_st3 == 1 .if \fast_st3 == 1
@@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif .endif
/* Save NEON registers */ /* Save NEON registers */
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h}, [x15], 16 ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15] ld1 {v2.8h}, [x15]
/* Save ARM registers and handle input arguments */
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
stp x4, x5, [sp], 16
stp x6, x7, [sp], 16
stp x8, x9, [sp], 16
stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF] ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, #8] ldr INPUT_BUF1, [INPUT_BUF, #8]
ldr INPUT_BUF2, [INPUT_BUF, #16] ldr INPUT_BUF2, [INPUT_BUF, #16]
@@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
b.gt 0b b.gt 0b
9: 9:
/* Restore all registers and return */ /* Restore all registers and return */
sub sp, sp, #336
ldr x15, [sp], 16
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
ldp x4, x5, [sp], 16
ldp x6, x7, [sp], 16
ldp x8, x9, [sp], 16
ldp x10, x30, [sp], 16
br x30 br x30
.unreq OUTPUT_WIDTH .unreq OUTPUT_WIDTH
.unreq INPUT_ROW .unreq INPUT_ROW
@@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
/* Save NEON registers */ /* Save NEON registers */
sub sp, sp, #64 sub sp, sp, #64
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 mov x9, sp
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
/* Outer loop over scanlines */ /* Outer loop over scanlines */
cmp NUM_ROWS, #1 cmp NUM_ROWS, #1
@@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
b.gt 0b b.gt 0b
9: 9:
/* Restore all registers and return */ /* Restore all registers and return */
sub sp, sp, #64
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
br x30 br x30
@@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
/* Save NEON registers */ /* Save NEON registers */
sub sp, sp, #64 sub sp, sp, #64
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 mov x10, sp
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
/* Load all DATA into NEON registers with the following allocation: /* Load all DATA into NEON registers with the following allocation:
* 0 1 2 3 | 4 5 6 7 * 0 1 2 3 | 4 5 6 7
@@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
/* Restore NEON registers */ /* Restore NEON registers */
sub sp, sp, #64
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
@@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
sub sp, sp, 272 sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
/* Save ARM registers */ /* Save ARM registers */
stp x19, x20, [sp], 16 stp x19, x20, [sp]
.if \fast_tbl == 1 .if \fast_tbl == 1
adr x15, Ljsimd_huff_encode_one_block_neon_consts adr x15, Ljsimd_huff_encode_one_block_neon_consts
.else .else
@@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
and v18.16b, v18.16b, v23.16b and v18.16b, v18.16b, v23.16b
add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
and v20.16b, v20.16b, v23.16b and v20.16b, v20.16b, v23.16b
add x15, sp, #0x80 /* x15 = t2 */ add x15, sp, #0x90 /* x15 = t2 */
and v22.16b, v22.16b, v23.16b and v22.16b, v22.16b, v23.16b
ldr w10, [x4, x12, lsl #2] ldr w10, [x4, x12, lsl #2]
addp v16.16b, v16.16b, v18.16b addp v16.16b, v16.16b, v18.16b
@@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
rbit x9, x9 /* x9 = index0 */ rbit x9, x9 /* x9 = index0 */
ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
cmp w12, #(64-8) cmp w12, #(64-8)
mov x11, sp add x11, sp, #16
b.lt 4f b.lt 4f
cbz x9, 6f cbz x9, 6f
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
@@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
put_bits x3, x11 put_bits x3, x11
cbnz x9, 1b cbnz x9, 1b
6: 6:
add x13, sp, #0xfe add x13, sp, #0x10e
cmp x15, x13 cmp x15, x13
b.hs 1f b.hs 1f
ldr w12, [x5] ldr w12, [x5]
@@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
checkbuf47 checkbuf47
put_bits x12, x14 put_bits x12, x14
1: 1:
sub sp, sp, 16
str PUT_BUFFER, [x0, #0x10] str PUT_BUFFER, [x0, #0x10]
str PUT_BITSw, [x0, #0x18] str PUT_BITSw, [x0, #0x18]
ldp x19, x20, [sp], 16 ldp x19, x20, [sp], 16