diff --git a/ChangeLog.md b/ChangeLog.md
index 9cd195fb..7f417a49 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -83,6 +83,12 @@ same structure, it was not known to pose a security threat, but removing the
 warning makes it easier to detect actual security issues, should they arise in
 the future.
 
+10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
+code.  Some of the routines were incorrectly reading and storing data below the
+stack pointer, which caused segfaults in certain applications under specific
+circumstances.
+
+
 1.5.0
 =====
 
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 6c1a9594..33098582 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon
 
     sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
     ld1             {v0.8h, v1.8h}, [x15]
     ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
     ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
@@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon
     shl             v10.8h, v2.8h, #(PASS1_BITS)
     sqxtn           v16.8b, v15.8h
     mov             TMP1, v16.d[0]
-    sub             sp, sp, #64
     mvn             TMP2, TMP1
 
     cbnz            TMP2, 2f
@@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon
     uxtw x3, w3
 
     /* Save all used NEON registers */
-    sub             sp, sp, 272
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
     /* Load constants (v3.4h is just used for padding) */
     adr             TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
 #endif
 
     /* vpop            {v8.4h - v15.4h}    ;not available */
-    sub             sp, sp, #272
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon
     uxtw x3, w3
 
     /* vpush           {v8.4h - v15.4h}            ; not available */
-    sub             sp, sp, 208
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
 
     /* Load constants */
     adr             TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v21.8b, v22.8b}, [sp], 16
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v30.8b, v31.8b}, [sp], 16
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v14.4h}, [TMP2]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
     st1             {v26.b}[1], [TMP2], 1
     st1             {v27.b}[5], [TMP2], 1
 
-    sub             sp, sp, #208
-    ldr             x15, [sp], 16
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v21.8b, v22.8b}, [sp], 16
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v30.8b, v31.8b}, [sp], 16
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     INPUT_BUF2      .req x1
 
     RGB             .req x7
-    Y               .req x8
-    U               .req x9
-    V               .req x10
+    Y               .req x9
+    U               .req x10
+    V               .req x11
     N               .req w15
 
-    sub             sp, sp, 336
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
 
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
     .if \fast_st3 == 1
@@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     .endif
 
     /* Save NEON registers */
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h}, [x15], 16
     ld1             {v2.8h}, [x15]
 
-    /* Save ARM registers and handle input arguments */
-    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
-    stp             x4, x5, [sp], 16
-    stp             x6, x7, [sp], 16
-    stp             x8, x9, [sp], 16
-    stp             x10, x30, [sp], 16
     ldr             INPUT_BUF0, [INPUT_BUF]
     ldr             INPUT_BUF1, [INPUT_BUF, #8]
     ldr             INPUT_BUF2, [INPUT_BUF, #16]
@@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     b.gt            0b
 9:
     /* Restore all registers and return */
-    sub             sp, sp, #336
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
-    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
-    ldp             x4, x5, [sp], 16
-    ldp             x6, x7, [sp], 16
-    ldp             x8, x9, [sp], 16
-    ldp             x10, x30, [sp], 16
     br              x30
     .unreq          OUTPUT_WIDTH
     .unreq          INPUT_ROW
@@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
 
     /* Save NEON registers */
     sub             sp, sp, #64
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
 
     /* Outer loop over scanlines */
     cmp             NUM_ROWS, #1
@@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     b.gt            0b
 9:
     /* Restore all registers and return */
-    sub             sp, sp, #64
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
     br              x30
@@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
 
     /* Save NEON registers */
     sub             sp, sp, #64
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
 
     /* Load all DATA into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
 
     /* Restore NEON registers */
-    sub             sp, sp, #64
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
 
@@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     sub             sp, sp, 272
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
-    stp             x19, x20, [sp], 16
+    stp             x19, x20, [sp]
 .if \fast_tbl == 1
     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
 .else
@@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     and             v18.16b, v18.16b, v23.16b
       add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
     and             v20.16b, v20.16b, v23.16b
-      add             x15, sp, #0x80           /* x15 = t2 */
+      add             x15, sp, #0x90           /* x15 = t2 */
     and             v22.16b, v22.16b, v23.16b
       ldr             w10, [x4, x12, lsl #2]
     addp            v16.16b, v16.16b, v18.16b
@@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     rbit            x9, x9             /* x9 = index0 */
     ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
     cmp             w12, #(64-8)
-    mov             x11, sp
+    add             x11, sp, #16
     b.lt            4f
     cbz             x9, 6f
     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
@@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     put_bits        x3, x11
     cbnz            x9, 1b
 6:
-    add             x13, sp, #0xfe
+    add             x13, sp, #0x10e
     cmp             x15, x13
     b.hs            1f
     ldr             w12, [x5]
@@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     checkbuf47
     put_bits        x12, x14
 1:
-    sub             sp, sp, 16
     str             PUT_BUFFER, [x0, #0x10]
     str             PUT_BITSw, [x0, #0x18]
     ldp             x19, x20, [sp], 16