diff --git a/ChangeLog.md b/ChangeLog.md index 9cd195fb..7f417a49 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -83,6 +83,12 @@ same structure, it was not known to pose a security threat, but removing the warning makes it easier to detect actual security issues, should they arise in the future. +10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD +code. Some of the routines were incorrectly reading and storing data below the +stack pointer, which caused segfaults in certain applications under specific +circumstances. + + 1.5.0 ===== diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S index 6c1a9594..33098582 100644 --- a/simd/jsimd_arm64_neon.S +++ b/simd/jsimd_arm64_neon.S @@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon sub sp, sp, #64 adr x15, Ljsimd_idct_islow_neon_consts - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 ld1 {v0.8h, v1.8h}, [x15] ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 @@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon shl v10.8h, v2.8h, #(PASS1_BITS) sqxtn v16.8b, v15.8h mov TMP1, v16.d[0] - sub sp, sp, #64 mvn TMP2, TMP1 cbnz TMP2, 2f @@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon uxtw x3, w3 /* Save all used NEON registers */ - sub sp, sp, 272 - str x15, [sp], 16 + sub sp, sp, 64 + mov x9, sp /* Load constants (v3.4h is just used for padding) */ adr TMP4, Ljsimd_idct_4x4_neon_consts - st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] /* Load all COEF_BLOCK into NEON registers with the following allocation: @@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon #endif /* vpop {v8.4h - v15.4h} ;not available */ - sub sp, sp, #272 - ldr x15, [sp], 16 - ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 blr x30 .unreq DCT_TABLE @@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon uxtw x3, w3 /* vpush {v8.4h - v15.4h} ; not available */ - sub sp, sp, 208 - str x15, [sp], 16 + sub sp, sp, 64 + mov x9, sp /* Load constants */ adr TMP2, Ljsimd_idct_2x2_neon_consts - st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - st1 {v21.8b, v22.8b}, [sp], 16 - st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - st1 {v30.8b, v31.8b}, [sp], 16 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 ld1 {v14.4h}, [TMP2] /* Load all COEF_BLOCK into NEON registers with the following allocation: @@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon st1 {v26.b}[1], [TMP2], 1 st1 {v27.b}[5], [TMP2], 1 - sub sp, sp, #208 - ldr x15, [sp], 16 - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - ld1 {v21.8b, v22.8b}, [sp], 16 - ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - ld1 {v30.8b, v31.8b}, [sp], 16 blr x30 .unreq DCT_TABLE @@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 INPUT_BUF2 .req x1 RGB .req x7 - Y .req x8 - U .req x9 - V .req x10 + Y .req x9 + U .req x10 + V .req x11 N .req w15 - sub sp, sp, 336 - str x15, [sp], 16 + sub sp, sp, 64 + mov x9, sp /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ .if \fast_st3 == 1 @@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 .endif /* Save NEON registers */ - st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 ld1 {v0.4h, v1.4h}, [x15], 16 ld1 {v2.8h}, [x15] - /* Save ARM registers and handle input arguments */ - /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ - stp x4, x5, [sp], 16 - stp x6, x7, [sp], 16 - stp x8, x9, [sp], 16 - stp x10, x30, [sp], 16 ldr INPUT_BUF0, [INPUT_BUF] ldr INPUT_BUF1, [INPUT_BUF, #8] ldr INPUT_BUF2, [INPUT_BUF, #16] @@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 b.gt 0b 9: /* Restore all registers and return */ - sub sp, sp, #336 - ldr x15, [sp], 16 - ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 - /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ - ldp x4, x5, [sp], 16 - ldp x6, x7, [sp], 16 - ldp x8, x9, [sp], 16 - ldp x10, x30, [sp], 16 br x30 .unreq OUTPUT_WIDTH .unreq INPUT_ROW @@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 /* Save NEON registers */ sub sp, sp, #64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + mov x9, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 /* Outer loop over scanlines */ cmp NUM_ROWS, #1 @@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 b.gt 0b 9: /* Restore all registers and return */ - sub sp, sp, #64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 br x30 @@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon /* Save NEON registers */ sub sp, sp, #64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 /* Load all DATA into NEON registers with the following allocation: * 0 1 2 3 | 4 5 6 7 @@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] /* Restore NEON registers */ - sub sp, sp, #64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 @@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl sub sp, sp, 272 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ /* Save ARM registers */ - stp x19, x20, [sp], 16 + stp x19, x20, [sp] .if \fast_tbl == 1 adr x15, Ljsimd_huff_encode_one_block_neon_consts .else @@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl and v18.16b, v18.16b, v23.16b add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ and v20.16b, v20.16b, v23.16b - add x15, sp, #0x80 /* x15 = t2 */ + add x15, sp, #0x90 /* x15 = t2 */ and v22.16b, v22.16b, v23.16b ldr w10, [x4, x12, lsl #2] addp v16.16b, v16.16b, v18.16b @@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl rbit x9, x9 /* x9 = index0 */ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ cmp w12, #(64-8) - mov x11, sp + add x11, sp, #16 b.lt 4f cbz x9, 6f st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 @@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl put_bits x3, x11 cbnz x9, 1b 6: - add x13, sp, #0xfe + add x13, sp, #0x10e cmp x15, x13 b.hs 1f ldr w12, [x5] @@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl checkbuf47 put_bits x12, x14 1: - sub sp, sp, 16 str PUT_BUFFER, [x0, #0x10] str PUT_BITSw, [x0, #0x18] ldp x19, x20, [sp], 16