diff --git a/ChangeLog.md b/ChangeLog.md
index 43d8961f..e71f476e 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,7 +1,7 @@
 2.1 pre-beta
 ============
 
-### Significant changes relative to 2.0.3:
+### Significant changes relative to 2.0.4:
 
 1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
 support the x32 ABI on Linux, which allows for using x86-64 instructions with
@@ -43,19 +43,14 @@ longer supports 32-bit Java virtual machines.  Oracle no longer provides a
 32-bit JVM for macOS, and Apple's implementation of Java 1.6 (Java for OS X
 systems) is long obsolete.
 
-5. Fixed a regression in the Windows packaging system (introduced by
-2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
-64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
-one of them could be uninstalled.
-
-6. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
 significantly optimized, resulting in a measured average overall compression
 speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
 and AMD CPUs, as well as a measured average overall compression speedup of
 0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
 implementation.
 
-7. When decompressing progressive Huffman-encoded JPEG images, the block
+6. When decompressing progressive Huffman-encoded JPEG images, the block
 smoothing algorithm that the libjpeg API library optionally applies is now more
 fault-tolerant.  Previously, if a particular scan was incomplete, then the
 smoothing parameters for the incomplete scan would be applied to the entire
@@ -66,42 +61,16 @@ higher-frequency scan.  libjpeg-turbo now applies block smoothing parameters to
 each iMCU row based on which scan generated the pixels in that row, rather than
 always using the block smoothing parameters for the most recent scan.
 
-8. Fixed a signed integer overflow and subsequent segfault that occurred when
-attempting to decompress images with more than 715827882 pixels using the
-64-bit C version of TJBench.
-
-9. Fixed out-of-bounds write in `tjDecompressToYUV2()` and
-`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
-occurred when attempting to decompress grayscale JPEG images that were
-compressed with a sampling factor other than 1 (for instance, with
-`cjpeg -grayscale -sample 2x2`).
-
-10. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
-incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
-JPEG images.  This was known to cause a buffer overflow when attempting to
-decompress some such images using `tjDecompressToYUV2()` or
-`tjDecompressToYUVPlanes()`.
-
-11. Fixed an issue, detected by ASan, whereby attempting to losslessly
-transform a specially-crafted malformed JPEG image containing an
-extremely-high-frequency coefficient block (junk image data that could never be
-generated by a legitimate JPEG compressor) could cause the Huffman encoder's
-local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that
-the buffer overrun was fully contained within the stack and did not cause a
-segfault or other user-visible errant behavior, and given that the lossless
-transformer (unlike the decompressor) is not generally exposed to arbitrary
-data exploits, this issue did not likely pose a security risk.
-
-12. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
+7. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
 (ARMv8) platforms.  This speeds up the compression of full-color progressive
 JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
 modern ARMv8 CPUs.
 
-13. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
 instructions, so that the Loongson MMI SIMD extensions can be included in any
 MIPS64 libjpeg-turbo build.
 
-14. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
 methods by which applications can guard against the exploits of the JPEG format
 described in the report
 ["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
@@ -112,6 +81,47 @@ limit the number of allowable scans in the input file.
 treat all warnings as fatal.
 
 
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images.  This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
+a specially-crafted malformed JPEG image containing an extremely-high-frequency
+coefficient block (junk image data that could never be generated by a
+legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
+be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that the buffer
+overrun was fully contained within the stack and did not cause a segfault or
+other user-visible errant behavior, and given that the lossless transformer
+(unlike the decompressor) is not generally exposed to arbitrary data exploits,
+this issue did not likely pose a security risk.
+
+6. The ARM 64-bit (ARMv8) NEON SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
 2.0.3
 =====
 
diff --git a/README.md b/README.md
index c61b8556..6df6c5ea 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
 Background
 ==========
 
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
-on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
-compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal.  On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines.  In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and ARMv8
+systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal.  On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines.  In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less
 powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
@@ -145,14 +145,14 @@ supported and which aren't.
 
 #### Fully supported
 
-- **libjpeg: IDCT scaling extensions in decompressor**<br>
+- **libjpeg API: IDCT scaling extensions in decompressor**<br>
   libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
   1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
   and 1/2 are SIMD-accelerated.)
 
-- **libjpeg: Arithmetic coding**
+- **libjpeg API: Arithmetic coding**
 
-- **libjpeg: In-memory source and destination managers**<br>
+- **libjpeg API: In-memory source and destination managers**<br>
   See notes below.
 
 - **cjpeg: Separate quality settings for luminance and chrominance**<br>
@@ -184,14 +184,14 @@ means of quality improvement.  The reader is invited to peruse the research at
 but it is the general belief of our project that these features have not
 demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
 
-- **libjpeg: DCT scaling in compressor**<br>
+- **libjpeg API: DCT scaling in compressor**<br>
   `cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
   There is no technical reason why DCT scaling could not be supported when
   emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
   below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
   8/9 would be available, which is of limited usefulness.
 
-- **libjpeg: SmartScale**<br>
+- **libjpeg API: SmartScale**<br>
   `cinfo.block_size` is silently ignored.
   SmartScale is an extension to the JPEG format that allows for DCT block
   sizes other than 8x8.  Providing support for this new format would be
@@ -204,7 +204,7 @@ demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
   interest in providing this feature would be as a means of supporting
   additional DCT scaling factors.
 
-- **libjpeg: Fancy downsampling in compressor**<br>
+- **libjpeg API: Fancy downsampling in compressor**<br>
   `cinfo.do_fancy_downsampling` is silently ignored.
   This requires the DCT scaling feature, which is not supported.
 
@@ -252,8 +252,8 @@ building libjpeg-turbo.  This will restore the pre-1.3 behavior, in which
 libjpeg v8 API/ABI.
 
 On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.1.0 to 62.2.0 if using libjpeg v6b API/ABI
-emulation and from 7.1.0 to 7.2.0 if using libjpeg v7 API/ABI emulation.
+the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
+emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
 
 Note that, on most Un*x systems, the dynamic linker will not look for a
 function in a library until that function is actually used.  Thus, if a program
@@ -329,7 +329,7 @@ in a way that makes the rest of the libjpeg infrastructure happy, so it is
 necessary to use the slow Huffman decoder when decompressing a JPEG image that
 has restart markers.  This can cause the decompression performance to drop by
 as much as 20%, but the performance will still be much greater than that of
-libjpeg.  Many consumer packages, such as PhotoShop, use restart markers when
+libjpeg.  Many consumer packages, such as Photoshop, use restart markers when
 generating JPEG images, so images generated by those programs will experience
 this issue.
 
diff --git a/djpeg.c b/djpeg.c
index 4d0fdeb3..6352fefc 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -538,7 +538,9 @@ main(int argc, char **argv)
   FILE *input_file;
   FILE *output_file;
   unsigned char *inbuffer = NULL;
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   unsigned long insize = 0;
+#endif
   JDIMENSION num_scanlines;
 
   /* On Mac, fetch a command line. */
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index cf9012af..b7c22dee 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and ARMv8 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/deb-control.in b/release/deb-control.in
index 08131c66..5415a86c 100644
--- a/release/deb-control.in
+++ b/release/deb-control.in
@@ -8,15 +8,14 @@ Maintainer: @PKGVENDOR@ <@PKGEMAIL@>
 Homepage: @PKGURL@
 Installed-Size: {__SIZE}
 Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
- libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
- AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
- on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
- compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is
- generally 2-6x as fast as libjpeg, all else being equal.  On other types of
- systems, libjpeg-turbo can still outperform libjpeg by a significant amount,
- by virtue of its highly-optimized Huffman coding routines.  In many cases, the
- performance of libjpeg-turbo rivals that of proprietary high-speed JPEG
- codecs.
+ libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+ baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
+ MIPS systems, as well as progressive JPEG compression on x86, x86-64, and
+ ARMv8 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as
+ libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
+ still outperform libjpeg by a significant amount, by virtue of its
+ highly-optimized Huffman coding routines.  In many cases, the performance of
+ libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
  .
  libjpeg-turbo implements both the traditional libjpeg API as well as the less
  powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
index e5730e6c..889fafec 100644
--- a/release/rpm.spec.in
+++ b/release/rpm.spec.in
@@ -51,14 +51,14 @@ Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{r
 %endif
 
 %description
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
-on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
-compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal.  On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines.  In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and ARMv8
+systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal.  On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines.  In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less
 powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
diff --git a/simd/arm64/jsimd_neon.S b/simd/arm64/jsimd_neon.S
index c45bed71..ccda4e31 100644
--- a/simd/arm64/jsimd_neon.S
+++ b/simd/arm64/jsimd_neon.S
@@ -31,6 +31,265 @@
 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
+#if defined(__APPLE__)
+.section __DATA,__const
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_idct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_idct_ifast_neon() */
+
+.balign 16
+Ljsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
+
+/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
+
+#define CONST_BITS  13
+
+#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
+
+.balign 16
+Ljsimd_idct_4x4_neon_consts:
+  .short FIX_1_847759065        /* v0.h[0] */
+  .short -FIX_0_765366865       /* v0.h[1] */
+  .short -FIX_0_211164243       /* v0.h[2] */
+  .short FIX_1_451774981        /* v0.h[3] */
+  .short -FIX_2_172734803       /* d1[0] */
+  .short FIX_1_061594337        /* d1[1] */
+  .short -FIX_0_509795579       /* d1[2] */
+  .short -FIX_0_601344887       /* d1[3] */
+  .short FIX_0_899976223        /* v2.h[0] */
+  .short FIX_2_562915447        /* v2.h[1] */
+  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
+  .short 0                      /* v2.h[3] */
+
+.balign 8
+Ljsimd_idct_2x2_neon_consts:
+  .short -FIX_0_720959822  /* v14[0] */
+  .short FIX_0_850430095   /* v14[1] */
+  .short -FIX_1_272758580  /* v14[2] */
+  .short FIX_3_624509785   /* v14[3] */
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_fdct_ifast_neon() */
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
+
+/* Constants for jsimd_h2*_downsample_neon() */
+
+.balign 16
+Ljsimd_h2_downsample_neon_consts:
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
+        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
+        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
+  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
+  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+
+/* Constants for jsimd_encode_mcu_AC_first_prepare_neon() */
+
+.balign 16
+Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+
+/* Constants for jsimd_encode_mcu_AC_refine_prepare_neon() */
+
+.balign 16
+Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+
 .text
 
 
@@ -55,6 +314,17 @@ _\fname:
 #endif
 .endm
 
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+    adrp            \reg, \symbol@PAGE
+    add             \reg, \reg, \symbol@PAGEOFF
+#else
+    adrp            \reg, \symbol
+    add             \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
 /* Transpose elements of single 128 bit registers */
 .macro transpose_single x0, x1, xi, xilen, literal
     ins             \xi\xilen[0], \x0\xilen[0]
@@ -139,51 +409,6 @@ _\fname:
 #define CONST_BITS  13
 #define PASS1_BITS  2
 
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
 #define XFIX_P_0_298  v0.h[0]
 #define XFIX_N_0_390  v0.h[1]
 #define XFIX_P_0_541  v0.h[2]
@@ -217,7 +442,7 @@ asm_function jsimd_idct_islow_neon
     uxtw x3, w3
 
     sub             sp, sp, #64
-    adr             x15, Ljsimd_idct_islow_neon_consts
+    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
     mov             x10, sp
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
@@ -791,13 +1016,6 @@ asm_function jsimd_idct_islow_neon
 #define XFIX_1_847759065  v0.h[2]
 #define XFIX_2_613125930  v0.h[3]
 
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
 asm_function jsimd_idct_ifast_neon
 
     DCT_TABLE       .req x0
@@ -832,7 +1050,7 @@ asm_function jsimd_idct_ifast_neon
      *   7 | d30     | d31     ( v23.8h )
      */
     /* Save NEON registers used in fast IDCT */
-    adr             TMP5, Ljsimd_idct_ifast_neon_consts
+    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
     ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
     ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
@@ -1023,38 +1241,6 @@ asm_function jsimd_idct_ifast_neon
  *       but readability will suffer somewhat.
  */
 
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* v0.h[0] */
-  .short -FIX_0_765366865       /* v0.h[1] */
-  .short -FIX_0_211164243       /* v0.h[2] */
-  .short FIX_1_451774981        /* v0.h[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* v2.h[0] */
-  .short FIX_2_562915447        /* v2.h[1] */
-  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
-  .short 0                      /* v2.h[3] */
-
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
     smull           v28.4s, \x4, v2.h[2]
     smlal           v28.4s, \x8, v0.h[0]
@@ -1121,7 +1307,7 @@ asm_function jsimd_idct_4x4_neon
     sub             sp, sp, 64
     mov             x9, sp
     /* Load constants (v3.4h is just used for padding) */
-    adr             TMP4, Ljsimd_idct_4x4_neon_consts
+    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
@@ -1264,13 +1450,6 @@ asm_function jsimd_idct_4x4_neon
  *       bit exact compatibility with jpeg-6b.
  */
 
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* v14[0] */
-  .short FIX_0_850430095   /* v14[1] */
-  .short -FIX_1_272758580  /* v14[2] */
-  .short FIX_3_624509785   /* v14[3] */
-
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
     sshll           v15.4s, \x4, #15
     smull           v26.4s, \x6, v14.h[3]
@@ -1311,7 +1490,7 @@ asm_function jsimd_idct_2x2_neon
     mov             x9, sp
 
     /* Load constants */
-    adr             TMP2, Ljsimd_idct_2x2_neon_consts
+    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v14.4h}, [TMP2]
@@ -1663,21 +1842,6 @@ asm_function jsimd_idct_2x2_neon
     do_yuv_to_rgb_stage2
 .endm
 
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
 .if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
 .else
@@ -1703,11 +1867,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     mov             x9, sp
 
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    .if \fast_st3 == 1
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
-    .else
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
-    .endif
+    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
 
     /* Save NEON registers */
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
@@ -2004,17 +2164,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,
     do_rgb_to_yuv_stage1
 .endm
 
-.balign 16
-.if \fast_ld3 == 1
-Ljsimd_\colorid\()_ycc_neon_consts:
-.else
-Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
-.endif
-  .short 19595, 38470, 7471, 11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128, 32767, 128
-  .short 32767, 128, 32767, 128
-
 .if \fast_ld3 == 1
 asm_function jsimd_\colorid\()_ycc_convert_neon
 .else
@@ -2037,11 +2186,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     N               .req w12
 
     /* Load constants to d0, d1, d2, d3 */
-    .if \fast_ld3 == 1
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
-    .else
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
-    .endif
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
     ld1             {v0.8h, v1.8h}, [x13]
 
     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
@@ -2241,50 +2386,6 @@ asm_function jsimd_convsamp_neon
 #define DESCALE_P1  (CONST_BITS - PASS1_BITS)
 #define DESCALE_P2  (CONST_BITS + PASS1_BITS)
 
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
 #define XFIX_P_0_298  v0.h[0]
 #define XFIX_N_0_390  v0.h[1]
 #define XFIX_P_0_541  v0.h[2]
@@ -2304,7 +2405,7 @@ asm_function jsimd_fdct_islow_neon
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
     ld1             {v0.8h, v1.8h}, [TMP]
 
     /* Save NEON registers */
@@ -2583,20 +2684,13 @@ asm_function jsimd_fdct_islow_neon
 #define XFIX_0_707106781  v0.h[2]
 #define XFIX_1_306562965  v0.h[3]
 
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
 asm_function jsimd_fdct_ifast_neon
 
     DATA            .req x0
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_ifast_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
     ld1             {v0.4h}, [TMP]
 
     /* Load all DATA into NEON registers with the following allocation:
@@ -2775,41 +2869,6 @@ asm_function jsimd_quantize_neon
  *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  */
 
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
-
 asm_function jsimd_h2v1_downsample_neon
     IMAGE_WIDTH     .req x0
     MAX_V_SAMP      .req x1
@@ -2827,7 +2886,7 @@ asm_function jsimd_h2v1_downsample_neon
     mov             TMPDUP, #0x10000
     lsl             TMP2, BLOCK_WIDTH, #4
     sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
     add             TMP3, TMP3, TMP2, lsl #4
     dup             v16.4s, TMPDUP
     ld1             {v18.16b}, [TMP3]
@@ -2906,7 +2965,7 @@ asm_function jsimd_h2v2_downsample_neon
     lsl             TMP2, BLOCK_WIDTH, #4
     lsl             TMPDUP, TMPDUP, #17
     sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
     orr             TMPDUP, TMPDUP, #1
     add             TMP3, TMP3, TMP2, lsl #4
     dup             v16.4s, TMPDUP
@@ -3012,41 +3071,6 @@ asm_function jsimd_h2v2_downsample_neon
 
 .macro generate_jsimd_huff_encode_one_block fast_tbl
 
-.balign 16
-.if \fast_tbl == 1
-Ljsimd_huff_encode_one_block_neon_consts:
-.else
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
-.endif
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-.if \fast_tbl == 1
-    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
-            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
-    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
-            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
-    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
-           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
-    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
-            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
-    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
-            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
-    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
-            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
-    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
-            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
-    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
-            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
-    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
-    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
-.endif
-
 .if \fast_tbl == 1
 asm_function jsimd_huff_encode_one_block_neon
 .else
@@ -3056,11 +3080,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
     stp             x19, x20, [sp]
-.if \fast_tbl == 1
-    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
-    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
+    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
     ldr             PUT_BUFFER, [x0, #0x10]
     ldr             PUT_BITSw, [x0, #0x18]
     ldrsh           w12, [x2]               /* load DC coeff in w12 */
@@ -3724,13 +3744,8 @@ generate_jsimd_huff_encode_one_block 0
     LENEND          .req w9
     BITS            .req x5
 
-.balign 16
-Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
 asm_function jsimd_encode_mcu_AC_first_prepare_neon
-    adr             T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
+    get_symbol_loc  T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
     neg             w3, w3                        /* Al = -Al */
     eor             ZERO.16b, ZERO.16b, ZERO.16b
     ld1             {ANDMASK.16b}, [T0]
@@ -3869,13 +3884,8 @@ asm_function jsimd_encode_mcu_AC_first_prepare_neon
     LENEND          .req w9
     BITS            .req x5
 
-.balign 16
-Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
 asm_function jsimd_encode_mcu_AC_refine_prepare_neon
-    adr             T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
+    get_symbol_loc  T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
     neg             w3, w3                        /* Al = -Al */
     movi            ONE.8h, #1
     eor             SIGN, SIGN, SIGN
diff --git a/turbojpeg.c b/turbojpeg.c
index fc471e9b..7f607d14 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1906,10 +1906,11 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
     if (xinfo[i].crop) {
       if ((t[i].r.x % xinfo[i].iMCU_sample_width) != 0 ||
           (t[i].r.y % xinfo[i].iMCU_sample_height) != 0) {
-        snprintf(errStr, JMSG_LENGTH_MAX,
+        snprintf(this->errStr, JMSG_LENGTH_MAX,
                  "To crop this JPEG image, x must be a multiple of %d\n"
                  "and y must be a multiple of %d.\n",
                  xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
+        this->isInstanceError = TRUE;
         retval = -1;  goto bailout;
       }
     }