diff --git a/BUILDING.md b/BUILDING.md
index b11ab8b3..a4ae1e06 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -15,13 +15,18 @@ Build Requirements
   * If using NASM, 2.10 or later is required.
   * If using NASM, 2.10 or later (except 2.11.08) is required for an x86-64 Mac
     build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code
-    when building macho64 objects.)  NASM or YASM can be obtained from
-    [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
+    when building macho64 objects.)
   * If using YASM, 1.2.0 or later is required.
+  * If building on macOS, NASM or YASM can be obtained from
+    [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
      - NOTE: Currently, if it is desirable to hide the SIMD function symbols in
        Mac executables or shared libraries that statically link with
-       libjpeg-turbo, then YASM must be used when building libjpeg-turbo.
+       libjpeg-turbo, then NASM 2.14 or later or YASM must be used when
+       building libjpeg-turbo.
   * If building on Windows, **nasm.exe**/**yasm.exe** should be in your `PATH`.
+  * NASM and YASM are located in the CRB (Code Ready Builder) repository on
+    Red Hat Enterprise Linux 8 and in the PowerTools repository on CentOS 8,
+    neither of which is enabled by default.
 
   The binary RPMs released by the NASM project do not work on older Linux
   systems, such as Red Hat Enterprise Linux 5.  On such systems, you can easily
@@ -48,8 +53,9 @@ Build Requirements
   install the Java Developer Package, which can be downloaded from
   <http://developer.apple.com/downloads> (Apple ID required.)  For other
   systems, you can obtain the Oracle Java Development Kit from
-  <http://www.java.com>.
+  <http://www.oracle.com/technetwork/java/javase/downloads>.
 
+  * If using JDK 11 or later, CMake 3.10.x or later must also be used.
 
 ### Windows
 
@@ -83,7 +89,10 @@ Build Requirements
   appropriate compiler paths automatically set.
 
 - If building the TurboJPEG Java wrapper, JDK 1.5 or later is required.  This
-  can be downloaded from <http://www.java.com>.
+  can be downloaded from
+  <http://www.oracle.com/technetwork/java/javase/downloads>.
+
+  * If using JDK 11 or later, CMake 3.10.x or later must also be used.
 
 
 Out-of-Tree Builds
@@ -521,7 +530,7 @@ a universal library.
 Building libjpeg-turbo for Android
 ----------------------------------
 
-Building libjpeg-turbo for Android platforms requires the
+Building libjpeg-turbo for Android platforms requires r13b or later of the
 [Android NDK](https://developer.android.com/tools/sdk/ndk).
 
 
@@ -531,35 +540,21 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support-- for example,
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r16b and earlier,
+      and "clang" must be used with NDK r17c and later}
+    ANDROID_VERSION={the minimum version of Android to support-- for example,
       "16", "19", etc.}
 
-    # It should not be necessary to modify the rest
-    HOST=arm-linux-androideabi
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
-    export CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
-      -D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_MODE=arm \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_ASM_FLAGS="--target=arm-linux-androideabi${ANDROID_VERSION}" \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
@@ -570,34 +565,21 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r14b and earlier,
+      and "clang" must be used with NDK r17c and later}
+    ANDROID_VERSION={the minimum version of Android to support.  "21" or later
       is required for a 64-bit build.}
 
-    # It should not be necessary to modify the rest
-    HOST=aarch64-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
-    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR aarch64)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_ARM_MODE=arm \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_ASM_FLAGS="--target=aarch64-linux-android${ANDROID_VERSION}" \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
@@ -608,34 +590,19 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r14b and earlier,
+      and "clang" must be used with NDK r17c and later}
     ANDROID_VERSION={The minimum version of Android to support-- for example,
       "16", "19", etc.}
 
-    # It should not be necessary to modify the rest
-    HOST=i686-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86
-    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/x86-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR i386)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=x86 \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
@@ -646,45 +613,23 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r14b and earlier,
+      and "clang" must be used with NDK r17c and later}
+    ANDROID_VERSION={the minimum version of Android to support.  "21" or later
       is required for a 64-bit build.}
 
-    # It should not be necessary to modify the rest
-    HOST=x86_64-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86_64
-    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/x86_64-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR x86_64)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=x86_64 \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
 
-If building for Android 4.0.x (API level < 16) or earlier, remove
-`-DCMAKE_POSITION_INDEPENDENT_CODE=1` from the CMake arguments and `-pie` from
-`LDFLAGS`.
-
-If building on Windows, add `.exe` to the end of `CMAKE_C_COMPILER`.
-
-
 Advanced CMake Options
 ----------------------
 
diff --git a/Brewfile b/Brewfile
deleted file mode 100644
index 4a9cb3d2..00000000
--- a/Brewfile
+++ /dev/null
@@ -1,4 +0,0 @@
-brew 'yasm'
-brew 'gcc@5'
-brew 'md5sha1sum'
-cask 'Caskroom/versions/java6'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a4d1137..51f4714b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,8 +109,6 @@ endif()
 
 include(cmakescripts/GNUInstallDirs.cmake)
 
-set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR})
-
 macro(report_directory var)
   if(CMAKE_INSTALL_${var} STREQUAL CMAKE_INSTALL_FULL_${var})
     message(STATUS "CMAKE_INSTALL_${var} = ${CMAKE_INSTALL_${var}}")
@@ -193,6 +191,10 @@ endif()
 report_option(ENABLE_SHARED "Shared libraries")
 report_option(ENABLE_STATIC "Static libraries")
 
+if(ENABLE_SHARED)
+  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR})
+endif()
+
 if(WITH_12BIT)
   set(WITH_ARITH_DEC 0)
   set(WITH_ARITH_ENC 0)
@@ -333,7 +335,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
       endif()
       if(${var} MATCHES "-xO2")
         string(REGEX REPLACE "-xO2" "-xO5" ${var} "${${var}}")
-  endif()
+      endif()
     endforeach()
   endif()
 endif()
@@ -470,8 +472,8 @@ if(UNIX AND NOT APPLE)
       # still work.
       file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/conftest.map
         "VERS_1 { global: foo;  local: *; }; VERS_2 { global: foo2; } VERS_1;")
-      set(CMAKE_REQUIRED_FLAGS "-Wl,-M,${CMAKE_CURRENT_BINARY_DIR}/conftest.map")
-      check_c_source_compiles("void foo() {} void foo2() {} int main(void) { return 0; }"
+      set(CMAKE_REQUIRED_FLAGS "-Wl,-M,${CMAKE_CURRENT_BINARY_DIR}/conftest.map -shared")
+      check_c_source_compiles("int foo() { return 0; } int foo2() { return 2; }"
         HAVE_MAPFILE)
       set(CMAKE_REQUIRED_FLAGS)
       file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/conftest.map)
@@ -536,7 +538,7 @@ elseif(NOT WITH_12BIT)
 endif()
 if(WITH_SIMD)
   message(STATUS "SIMD extensions: ${CPU_TYPE} (WITH_SIMD = ${WITH_SIMD})")
-  if(MSVC_IDE)
+  if(MSVC_IDE OR XCODE)
     set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
   endif()
 else()
@@ -565,16 +567,16 @@ if(WITH_TURBOJPEG)
       turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c rdbmp.c rdppm.c
       wrbmp.c wrppm.c)
     set(TJMAPFILE ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg-mapfile)
-  if(WITH_JAVA)
-    set(TURBOJPEG_SOURCES ${TURBOJPEG_SOURCES} turbojpeg-jni.c)
-    include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})
+    if(WITH_JAVA)
+      set(TURBOJPEG_SOURCES ${TURBOJPEG_SOURCES} turbojpeg-jni.c)
+      include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})
       set(TJMAPFILE ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg-mapfile.jni)
-  endif()
+    endif()
     add_library(turbojpeg SHARED ${TURBOJPEG_SOURCES})
     set_property(TARGET turbojpeg PROPERTY COMPILE_FLAGS
       "-DBMP_SUPPORTED -DPPM_SUPPORTED")
     if(WIN32)
-    set_target_properties(turbojpeg PROPERTIES DEFINE_SYMBOL DLLDEFINE)
+      set_target_properties(turbojpeg PROPERTIES DEFINE_SYMBOL DLLDEFINE)
     endif()
     if(MINGW)
       set_target_properties(turbojpeg PROPERTIES LINK_FLAGS -Wl,--kill-at)
@@ -609,7 +611,7 @@ if(WITH_TURBOJPEG)
     target_link_libraries(tjexample turbojpeg)
     if(UNIX)
       target_link_libraries(tjexample m)
-    endif()
+  endif()
   endif()
 
   if(ENABLE_STATIC)
@@ -699,7 +701,7 @@ add_executable(wrjpgcom wrjpgcom.c)
 
 add_subdirectory(md5)
 
-if(MSVC_IDE)
+if(MSVC_IDE OR XCODE)
   set(OBJDIR "\${CTEST_CONFIGURATION_TYPE}/")
 else()
   set(OBJDIR "")
@@ -715,7 +717,7 @@ if(WITH_12BIT)
   set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
   set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
   set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
-  set(MD5_JPEG_420_IFAST_Q100_PROG a1da220b5604081863a504297ed59e55)
+  set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
   set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
   set(MD5_PPM_420M_Q100_IFAST 980a1a3c5bf9510022869d30b7d26566)
   set(MD5_JPEG_GRAY_ISLOW 235c90707b16e2e069f37c888b2636d9)
@@ -765,7 +767,7 @@ else()
   set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
   set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
   set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
-  set(MD5_JPEG_420_IFAST_Q100_PROG 990cbe0329c882420a2094da7e5adade)
+  set(MD5_JPEG_420_IFAST_Q100_PROG e59bb462016a8d9a748c330a3474bb55)
   set(MD5_PPM_420_Q100_IFAST 5a732542015c278ff43635e473a8a294)
   set(MD5_PPM_420M_Q100_IFAST ff692ee9323a3b424894862557c092f1)
   set(MD5_JPEG_GRAY_ISLOW 72b51f894b8f4a10b3ee3066770aa38d)
@@ -1032,6 +1034,8 @@ foreach(libtype ${TEST_LIBTYPES})
 
   add_test(djpeg-${libtype}-rgb-islow-icc-cmp
     ${MD5CMP} b06a39d730129122e85c1363ed1bbc9e testout_rgb_islow.icc)
+  set_tests_properties(djpeg-${libtype}-rgb-islow-icc-cmp PROPERTIES
+    DEPENDS djpeg-${libtype}-rgb-islow)
 
   add_bittest(jpegtran icc "-copy;all;-icc;${TESTIMAGES}/test2.icc"
     testout_rgb_islow2.jpg testout_rgb_islow.jpg ${MD5_JPEG_RGB_ISLOW2})
@@ -1078,7 +1082,7 @@ foreach(libtype ${TEST_LIBTYPES})
 
   # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
   add_bittest(cjpeg 420-q100-ifast-prog
-    "-sample;2x2;-quality;100;-dct;fast;-prog"
+    "-sample;2x2;-quality;100;-dct;fast;-scans;${TESTIMAGES}/test.scan"
     testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
     ${MD5_JPEG_420_IFAST_Q100_PROG})
 
@@ -1126,12 +1130,12 @@ foreach(libtype ${TEST_LIBTYPES})
     ${MD5_JPEG_420S_IFAST_OPT})
 
   if(FLOATTEST)
-  # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
+    # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
     add_bittest(cjpeg 3x2-float-prog "-sample;3x2;-dct;float;-prog"
       testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm
       ${MD5_JPEG_3x2_FLOAT_PROG_${FLOATTEST_UC}})
 
-  # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
+    # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
     add_bittest(djpeg 3x2-float-prog "-dct;float"
       testout_3x2_float.ppm testout_3x2_float_prog.jpg
       ${MD5_PPM_3x2_FLOAT_${FLOATTEST_UC}} cjpeg-${libtype}-3x2-float-prog)
@@ -1321,6 +1325,8 @@ if(WITH_TURBOJPEG)
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
       COMMAND echo tjbenchtest -progressive
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
+      COMMAND echo tjbenchtest -progressive -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive -yuv
       COMMAND echo tjexampletest
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
       COMMAND echo tjbenchtest.java
@@ -1329,6 +1335,9 @@ if(WITH_TURBOJPEG)
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -yuv
       COMMAND echo tjbenchtest.java -progressive
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -progressive
+      COMMAND echo tjbenchtest.java -progressive -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
+        -progressive -yuv
       COMMAND echo tjexampletest.java
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest.java
       DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
@@ -1344,6 +1353,10 @@ if(WITH_TURBOJPEG)
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
       COMMAND echo tjbenchtest -yuv -alloc
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
+      COMMAND echo tjbenchtest -progressive
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
+      COMMAND echo tjbenchtest -progressive -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive -yuv
       COMMAND echo tjexampletest
       COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
       DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest)
@@ -1363,12 +1376,22 @@ if(WITH_TURBOJPEG)
       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
       RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
+      CMAKE_C_LINKER_SUPPORTS_PDB)
+      install(FILES "$<TARGET_PDB_FILE:turbojpeg>"
+        DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL)
+    endif()
   endif()
   if(ENABLE_STATIC)
     install(TARGETS turbojpeg-static ARCHIVE
       DESTINATION ${CMAKE_INSTALL_LIBDIR})
     if(NOT ENABLE_SHARED)
-      install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/tjbench-static${EXE}
+      if(MSVC_IDE OR XCODE)
+        set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
+      else()
+        set(DIR ${CMAKE_CURRENT_BINARY_DIR})
+      endif()
+      install(PROGRAMS ${DIR}/tjbench-static${EXE}
         DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME tjbench${EXE})
     endif()
   endif()
@@ -1379,11 +1402,16 @@ endif()
 if(ENABLE_STATIC)
   install(TARGETS jpeg-static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
   if(NOT ENABLE_SHARED)
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/cjpeg-static${EXE}
+    if(MSVC_IDE OR XCODE)
+      set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
+    else()
+      set(DIR ${CMAKE_CURRENT_BINARY_DIR})
+    endif()
+    install(PROGRAMS ${DIR}/cjpeg-static${EXE}
       DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME cjpeg${EXE})
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/djpeg-static${EXE}
+    install(PROGRAMS ${DIR}/djpeg-static${EXE}
       DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME djpeg${EXE})
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/jpegtran-static${EXE}
+    install(PROGRAMS ${DIR}/jpegtran-static${EXE}
       DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME jpegtran${EXE})
   endif()
 endif()
@@ -1408,10 +1436,10 @@ if(UNIX OR MINGW)
     ${CMAKE_CURRENT_SOURCE_DIR}/rdjpgcom.1
     ${CMAKE_CURRENT_SOURCE_DIR}/wrjpgcom.1
     DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
+endif()
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
     ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libturbojpeg.pc
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-endif()
 
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jconfig.h
   ${CMAKE_CURRENT_SOURCE_DIR}/jerror.h ${CMAKE_CURRENT_SOURCE_DIR}/jmorecfg.h
diff --git a/ChangeLog.md b/ChangeLog.md
index 8f0d11b1..4d1219e5 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,152 @@
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed an out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images.  This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
+a specially-crafted malformed JPEG image containing an extremely-high-frequency
+coefficient block (junk image data that could never be generated by a
+legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
+be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that the buffer
+overrun was fully contained within the stack and did not cause a segfault or
+other user-visible errant behavior, and given that the lossless transformer
+(unlike the decompressor) is not generally exposed to arbitrary data exploits,
+this issue did not likely pose a security risk.
+
+6. The ARM 64-bit (ARMv8) NEON SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
+2.0.3
+=====
+
+### Significant changes relative to 2.0.2:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when passing invalid arguments to certain methods in the TurboJPEG
+Java API.
+
+2. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that was known to cause an illegal
+instruction exception, in rare cases, on CPUs that lack support for CPUID leaf
+07H (or on which the maximum CPUID leaf has been limited by way of a BIOS
+setting.)
+
+3. The 4:4:0 (h1v2) fancy (smooth) chroma upsampling algorithm in the
+decompressor now uses a similar bias pattern to that of the 4:2:2 (h2v1) fancy
+chroma upsampling algorithm, rounding up or down the upsampled result for
+alternate pixels rather than always rounding down.  This ensures that,
+regardless of whether a 4:2:2 JPEG image is rotated or transposed prior to
+decompression (in the frequency domain) or after decompression (in the spatial
+domain), the final image will be similar.
+
+4. Fixed an integer overflow and subsequent segfault that occurred when
+attempting to compress or decompress images with more than 1 billion pixels
+using the TurboJPEG API.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 16 would result in an
+error ("Missing Huffman code table entry") and an invalid JPEG image.
+
+6. Fixed an issue whereby `tjDecodeYUV()` and `tjDecodeYUVPlanes()` would throw
+an error ("Invalid progressive parameters") or a warning ("Inconsistent
+progression sequence") if passed a TurboJPEG instance that was previously used
+to decompress a progressive JPEG image.
+
+
+2.0.2
+=====
+
+### Significant changes relative to 2.0.1:
+
+1. Fixed a regression introduced by 2.0.1[5] that prevented a runtime search
+path (rpath) from being embedded in the libjpeg-turbo shared libraries and
+executables for macOS and iOS.  This caused a fatal error of the form
+"dyld: Library not loaded" when attempting to use one of the executables,
+unless `DYLD_LIBRARY_PATH` was explicitly set to the location of the
+libjpeg-turbo shared libraries.
+
+2. Fixed an integer overflow and subsequent segfault (CVE-2018-20330) that
+occurred when attempting to load a BMP file with more than 1 billion pixels
+using the `tjLoadImage()` function.
+
+3. Fixed a buffer overrun (CVE-2018-19664) that occurred when attempting to
+decompress a specially-crafted malformed JPEG image to a 256-color BMP using
+djpeg.
+
+4. Fixed a floating point exception that occurred when attempting to
+decompress a specially-crafted malformed JPEG image with a specified image
+width or height of 0 using the C version of TJBench.
+
+5. The TurboJPEG API will now decompress 4:4:4 JPEG images with 2x1, 1x2, 3x1,
+or 1x3 luminance and chrominance sampling factors.  This is a non-standard way
+of specifying 1x subsampling (normally 4:4:4 JPEGs have 1x1 luminance and
+chrominance sampling factors), but the JPEG format and the libjpeg API both
+allow it.
+
+6. Fixed a regression introduced by 2.0 beta1[7] that caused djpeg to generate
+incorrect PPM images when used with the `-colors` option.
+
+7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
+
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
+
+2.0.1
+=====
+
+### Significant changes relative to 2.0.0:
+
+1. Fixed a regression introduced with the new CMake-based Un*x build system,
+whereby jconfig.h could cause compiler warnings of the form
+`"HAVE_*_H" redefined` if it was included by downstream Autotools-based
+projects that used `AC_CHECK_HEADERS()` to check for the existence of locale.h,
+stddef.h, or stdlib.h.
+
+2. The `jsimd_quantize_float_dspr2()` and `jsimd_convsamp_float_dspr2()`
+functions in the MIPS DSPr2 SIMD extensions are now disabled at compile time
+if the soft float ABI is enabled.  Those functions use instructions that are
+incompatible with the soft float ABI.
+
+3. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that caused libjpeg-turbo to crash on
+Windows 7 if Service Pack 1 was not installed.
+
+4. Fixed an out-of-bounds read in cjpeg that occurred when attempting to
+compress a specially-crafted malformed color-index (8-bit-per-sample) Targa
+file in which some of the samples (color indices) exceeded the bounds of the
+Targa file's color table.
+
+5. Fixed an issue whereby installing a fully static build of libjpeg-turbo
+(a build in which `CFLAGS` contains `-static` and `ENABLE_SHARED` is `0`) would
+fail with "No valid ELF RPATH or RUNPATH entry exists in the file."
+
+
 2.0.0
 =====
 
@@ -30,10 +179,11 @@ would produce a "Bogus message code" error message if the underlying bitmap and
 PPM readers/writers threw an error that was specific to the readers/writers
 (as opposed to a general libjpeg API error.)
 
-4. Fixed an issue whereby a specially-crafted malformed BMP file, one in which
-the header specified an image width of 1073741824 pixels, would trigger a
-floating point exception (division by zero) in the `tjLoadImage()` function
-when attempting to load the BMP file into a 4-component image buffer.
+4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
+file, one in which the header specified an image width of 1073741824 pixels,
+would trigger a floating point exception (division by zero) in the
+`tjLoadImage()` function when attempting to load the BMP file into a
+4-component image buffer.
 
 5. Fixed an issue whereby certain combinations of calls to
 `jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
@@ -47,10 +197,10 @@ a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
 7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
 extensions if it detects that the compiler does not support DSPr2 instructions.
 
-8. Fixed out-of-bounds read in cjpeg that occurred when attempting to compress
-a specially-crafted malformed color-index (8-bit-per-sample) BMP file in which
-some of the samples (color indices) exceeded the bounds of the BMP file's color
-table.
+8. Fixed an out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
+attempting to compress a specially-crafted malformed color-index
+(8-bit-per-sample) BMP file in which some of the samples (color indices)
+exceeded the bounds of the BMP file's color table.
 
 9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
 by the Clang and GCC undefined behavior sanitizers, that could be triggered by
@@ -210,8 +360,8 @@ write scanlines in bottom-up order.)  djpeg will now exit gracefully if an
 output format other than PPM/PGM, GIF, or Targa is selected along with the
 `-crop` option.
 
-4. Fixed an issue whereby `jpeg_skip_scanlines()` would segfault if color
-quantization was enabled.
+4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
+segfault if color quantization was enabled.
 
 5. TJBench (both C and Java versions) will now display usage information if any
 command-line argument is unrecognized.  This prevents the program from silently
@@ -838,13 +988,13 @@ and IDCT algorithms (both are used during JPEG decompression.)  For unknown
 reasons (probably related to clang), this code cannot currently be compiled for
 iOS.
 
-15. Fixed an extremely rare bug that could cause the Huffman encoder's local
-buffer to overrun when a very high-frequency MCU is compressed using quality
-100 and no subsampling, and when the JPEG output buffer is being dynamically
-resized by the destination manager.  This issue was so rare that, even with a
-test program specifically designed to make the bug occur (by injecting random
-high-frequency YUV data into the compressor), it was reproducible only once in
-about every 25 million iterations.
+15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
+encoder's local buffer to overrun when a very high-frequency MCU is compressed
+using quality 100 and no subsampling, and when the JPEG output buffer is being
+dynamically resized by the destination manager.  This issue was so rare that,
+even with a test program specifically designed to make the bug occur (by
+injecting random high-frequency YUV data into the compressor), it was
+reproducible only once in about every 25 million iterations.
 
 16. Fixed an oversight in the TurboJPEG C wrapper:  if any of the JPEG
 compression functions was called repeatedly with the same
@@ -879,8 +1029,9 @@ entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
 jpegtran, for instance) would result in an error, `Requested feature was
 omitted at compile time`.
 
-4. Fixed a couple of issues whereby malformed JPEG images would cause
-libjpeg-turbo to use uninitialized memory during decompression.
+4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
+JPEG images would cause libjpeg-turbo to use uninitialized memory during
+decompression.
 
 5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
 when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
@@ -1019,9 +1170,9 @@ correct behavior of the colorspace extensions when merged upsampling is used.
 upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
 calling conventions.
 
-4. Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
-images (specifically, images in which the component count was erroneously set
-to a large value) would cause libjpeg-turbo to segfault.
+4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
+corrupt JPEG images (specifically, images in which the component count was
+erroneously set to a large value) would cause libjpeg-turbo to segfault.
 
 5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
 processors.  The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
diff --git a/LICENSE.md b/LICENSE.md
index 0f6ec4b3..99c9aadc 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -14,7 +14,7 @@ libjpeg-turbo is covered by three compatible BSD-style open source licenses:
   This license covers the TurboJPEG API library and associated programs, as
   well as the build system.
 
-- The zlib License, which is listed below
+- The [zlib License](https://opensource.org/licenses/Zlib)
 
   This license is a subset of the other two, and it covers the libjpeg-turbo
   SIMD extensions.
@@ -66,7 +66,7 @@ best of our understanding.
 
     2.  If your binary distribution includes or uses the TurboJPEG API, then
         your product documentation must include the text of the Modified BSD
-        License.
+        License (see below.)
 
         **Origin**
         - Clause 2 of the Modified BSD License
@@ -91,7 +91,8 @@ best of our understanding.
 The Modified (3-clause) BSD License
 ===================================
 
-Copyright (C)\<YEAR\> \<AUTHOR\>.  All Rights Reserved.
+Copyright (C)2009-2020 D. R. Commander.  All Rights Reserved.
+Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
@@ -118,28 +119,6 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 
-The zlib License
-================
-
-Copyright (C) \<YEAR\>, \<AUTHOR\>.
-
-This software is provided 'as-is', without any express or implied
-warranty.  In no event will the authors be held liable for any damages
-arising from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not
-   claim that you wrote the original software. If you use this software
-   in a product, an acknowledgment in the product documentation would be
-   appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-
 Why Three Licenses?
 ===================
 
diff --git a/appveyor.yml b/appveyor.yml
index 971a8de3..df880837 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -17,7 +17,7 @@ before_build:
   - cd cmake_build
   - cmake .. -G "Visual Studio 15 2017" -DPNG_SUPPORTED=NO
 
-build_script: 
+build_script:
   - cd %APPVEYOR_BUILD_FOLDER%
   - msbuild cmake_build\mozjpeg.sln
 
diff --git a/cjpeg.c b/cjpeg.c
index e4fe35b9..99dd5c7d 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -879,12 +879,10 @@ main(int argc, char **argv)
 
   if (memdst) {
     fprintf(stderr, "Compressed size:  %lu bytes\n", outsize);
-    if (outbuffer != NULL)
-      free(outbuffer);
+    free(outbuffer);
   }
 
-  if (icc_profile != NULL)
-    free(icc_profile);
+  free(icc_profile);
 
   /* All done. */
   exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
diff --git a/cmakescripts/BuildPackages.cmake b/cmakescripts/BuildPackages.cmake
index 57f0672f..395dd989 100644
--- a/cmakescripts/BuildPackages.cmake
+++ b/cmakescripts/BuildPackages.cmake
@@ -83,7 +83,7 @@ endif()
 if(BITS EQUAL 64)
   set(INST_PLATFORM "${INST_PLATFORM} 64-bit")
   set(INST_NAME ${INST_NAME}64)
-  set(INST_REG_NAME ${INST_DIR}64)
+  set(INST_REG_NAME ${INST_REG_NAME}64)
   set(INST_DEFS ${INST_DEFS} -DWIN64)
 endif()
 
@@ -145,6 +145,11 @@ set(DEFAULT_IOS_ARMV8_BUILD ${CMAKE_SOURCE_DIR}/iosarmv8)
 set(IOS_ARMV8_BUILD ${DEFAULT_IOS_ARMV8_BUILD} CACHE PATH
   "Directory containing ARMv8 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV8_BUILD})")
 
+set(OSX_APP_CERT_NAME "" CACHE STRING
+  "Name of the Developer ID Application certificate (in the macOS keychain) that should be used to sign the libjpeg-turbo DMG.  Leave this blank to generate an unsigned DMG.")
+set(OSX_INST_CERT_NAME "" CACHE STRING
+  "Name of the Developer ID Installer certificate (in the macOS keychain) that should be used to sign the libjpeg-turbo installer package.  Leave this blank to generate an unsigned package.")
+
 configure_file(release/makemacpkg.in pkgscripts/makemacpkg)
 configure_file(release/Distribution.xml.in pkgscripts/Distribution.xml)
 configure_file(release/uninstall.in pkgscripts/uninstall)
diff --git a/cmakescripts/GNUInstallDirs.cmake b/cmakescripts/GNUInstallDirs.cmake
index ef564bb6..7c411965 100644
--- a/cmakescripts/GNUInstallDirs.cmake
+++ b/cmakescripts/GNUInstallDirs.cmake
@@ -118,7 +118,7 @@
 #   absolute paths where necessary, using the same logic.
 
 #=============================================================================
-# Copyright 2016 D. R. Commander
+# Copyright 2016, 2019 D. R. Commander
 # Copyright 2016 Dmitry Marakasov
 # Copyright 2016 Roger Leigh
 # Copyright 2015 Alex Turbov
@@ -184,7 +184,7 @@ macro(GNUInstallDirs_set_install_dir var docstring)
     "${docstring} (Default: ${CMAKE_INSTALL_DEFAULT_${var}})"
     ${_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var}})
 
-  if(NOT "${CMAKE_INSTALL_${var}}" STREQUAL "${CMAKE_INSTALL_DEFAULT_${var}}")
+  if(NOT CMAKE_INSTALL_${var} STREQUAL CMAKE_INSTALL_DEFAULT_${var})
     unset(_GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} CACHE)
   endif()
 
diff --git a/djpeg.c b/djpeg.c
index 920e90d7..e127cf79 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -516,7 +516,9 @@ main(int argc, char **argv)
   FILE *input_file;
   FILE *output_file;
   unsigned char *inbuffer = NULL;
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   unsigned long insize = 0;
+#endif
   JDIMENSION num_scanlines;
 
   /* On Mac, fetch a command line. */
@@ -811,7 +813,7 @@ main(int argc, char **argv)
   end_progress_monitor((j_common_ptr)&cinfo);
 #endif
 
-  if (memsrc && inbuffer != NULL)
+  if (memsrc)
     free(inbuffer);
 
   /* All done. */
diff --git a/doc/html/group___turbo_j_p_e_g.html b/doc/html/group___turbo_j_p_e_g.html
index cef856a7..5d67d78a 100644
--- a/doc/html/group___turbo_j_p_e_g.html
+++ b/doc/html/group___turbo_j_p_e_g.html
@@ -2078,7 +2078,7 @@ If you choose option 1, <code>*jpegSize</code> should be set to the size of your
 <p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by the compression and transform functions or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#gaec627dd4c5f30b7a775a7aea3bec5d83" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramname">buffer</td><td>address of the buffer to free</td></tr>
+    <tr><td class="paramname">buffer</td><td>address of the buffer to free. If the address is NULL, then this function has no effect.</td></tr>
   </table>
   </dd>
 </dl>
diff --git a/example.txt b/example.txt
index 04c11fee..bc0ba49d 100644
--- a/example.txt
+++ b/example.txt
@@ -288,12 +288,14 @@ my_error_exit(j_common_ptr cinfo)
 }
 
 
+METHODDEF(int) do_read_JPEG_file(struct jpeg_decompress_struct *cinfo,
+                                 char *filename);
+
 /*
  * Sample routine for JPEG decompression.  We assume that the source file name
  * is passed in.  We want to return 1 on success, 0 on error.
  */
 
-
 GLOBAL(int)
 read_JPEG_file(char *filename)
 {
@@ -301,6 +303,21 @@ read_JPEG_file(char *filename)
    * working space (which is allocated as needed by the JPEG library).
    */
   struct jpeg_decompress_struct cinfo;
+
+  return do_read_JPEG_file(&cinfo, filename);
+}
+
+/*
+ * We call the libjpeg API from within a separate function, because modifying
+ * the local non-volatile jpeg_decompress_struct instance below the setjmp()
+ * return point and then accessing the instance after setjmp() returns would
+ * result in undefined behavior that may potentially overwrite all or part of
+ * the structure.
+ */
+
+METHODDEF(int)
+do_read_JPEG_file(struct jpeg_decompress_struct *cinfo, char *filename)
+{
   /* We use our private extension JPEG error handler.
    * Note that this struct must live as long as the main JPEG parameter
    * struct, to avoid dangling-pointer problems.
@@ -325,27 +342,27 @@ read_JPEG_file(char *filename)
   /* Step 1: allocate and initialize JPEG decompression object */
 
   /* We set up the normal JPEG error routines, then override error_exit. */
-  cinfo.err = jpeg_std_error(&jerr.pub);
+  cinfo->err = jpeg_std_error(&jerr.pub);
   jerr.pub.error_exit = my_error_exit;
   /* Establish the setjmp return context for my_error_exit to use. */
   if (setjmp(jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error.
      * We need to clean up the JPEG object, close the input file, and return.
      */
-    jpeg_destroy_decompress(&cinfo);
+    jpeg_destroy_decompress(cinfo);
     fclose(infile);
     return 0;
   }
   /* Now we can initialize the JPEG decompression object. */
-  jpeg_create_decompress(&cinfo);
+  jpeg_create_decompress(cinfo);
 
   /* Step 2: specify data source (eg, a file) */
 
-  jpeg_stdio_src(&cinfo, infile);
+  jpeg_stdio_src(cinfo, infile);
 
   /* Step 3: read file parameters with jpeg_read_header() */
 
-  (void)jpeg_read_header(&cinfo, TRUE);
+  (void)jpeg_read_header(cinfo, TRUE);
   /* We can ignore the return value from jpeg_read_header since
    *   (a) suspension is not possible with the stdio data source, and
    *   (b) we passed TRUE to reject a tables-only JPEG file as an error.
@@ -360,7 +377,7 @@ read_JPEG_file(char *filename)
 
   /* Step 5: Start decompressor */
 
-  (void)jpeg_start_decompress(&cinfo);
+  (void)jpeg_start_decompress(cinfo);
   /* We can ignore the return value since suspension is not possible
    * with the stdio data source.
    */
@@ -372,30 +389,30 @@ read_JPEG_file(char *filename)
    * In this example, we need to make an output work buffer of the right size.
    */
   /* JSAMPLEs per row in output buffer */
-  row_stride = cinfo.output_width * cinfo.output_components;
+  row_stride = cinfo->output_width * cinfo->output_components;
   /* Make a one-row-high sample array that will go away when done with image */
-  buffer = (*cinfo.mem->alloc_sarray)
-                ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, 1);
+  buffer = (*cinfo->mem->alloc_sarray)
+                ((j_common_ptr)cinfo, JPOOL_IMAGE, row_stride, 1);
 
   /* Step 6: while (scan lines remain to be read) */
   /*           jpeg_read_scanlines(...); */
 
-  /* Here we use the library's state variable cinfo.output_scanline as the
+  /* Here we use the library's state variable cinfo->output_scanline as the
    * loop counter, so that we don't have to keep track ourselves.
    */
-  while (cinfo.output_scanline < cinfo.output_height) {
+  while (cinfo->output_scanline < cinfo->output_height) {
     /* jpeg_read_scanlines expects an array of pointers to scanlines.
      * Here the array is only one element long, but you could ask for
      * more than one scanline at a time if that's more convenient.
      */
-    (void)jpeg_read_scanlines(&cinfo, buffer, 1);
+    (void)jpeg_read_scanlines(cinfo, buffer, 1);
     /* Assume put_scanline_someplace wants a pointer and sample count. */
     put_scanline_someplace(buffer[0], row_stride);
   }
 
   /* Step 7: Finish decompression */
 
-  (void)jpeg_finish_decompress(&cinfo);
+  (void)jpeg_finish_decompress(cinfo);
   /* We can ignore the return value since suspension is not possible
    * with the stdio data source.
    */
@@ -403,7 +420,7 @@ read_JPEG_file(char *filename)
   /* Step 8: Release JPEG decompression object */
 
   /* This is an important step since it will release a good deal of memory. */
-  jpeg_destroy_decompress(&cinfo);
+  jpeg_destroy_decompress(cinfo);
 
   /* After finish_decompress, we can close the input file.
    * Here we postpone it until after no more JPEG errors are possible,
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index b7ee1e4d..578b8162 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -58,11 +58,21 @@ endif()
 add_custom_target(javadoc COMMAND
   javadoc -notimestamp -d ${CMAKE_CURRENT_SOURCE_DIR}/doc -sourcepath ${CMAKE_CURRENT_SOURCE_DIR} org.libjpegturbo.turbojpeg)
 set(JAVACLASSPATH ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/turbojpeg-java.dir)
-add_custom_target(javah
-  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJ
-  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJCompressor
-  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJDecompressor
-  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJTransformer)
+if(Java_VERSION_MAJOR GREATER 9)
+  add_custom_target(javah
+    COMMAND javac -h ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH}
+      -d ${CMAKE_CURRENT_BINARY_DIR}/__unused
+      ${CMAKE_CURRENT_SOURCE_DIR}/org/libjpegturbo/turbojpeg/TJ.java
+      ${CMAKE_CURRENT_SOURCE_DIR}/org/libjpegturbo/turbojpeg/TJCompressor.java
+      ${CMAKE_CURRENT_SOURCE_DIR}/org/libjpegturbo/turbojpeg/TJDecompressor.java
+      ${CMAKE_CURRENT_SOURCE_DIR}/org/libjpegturbo/turbojpeg/TJTransformer.java)
+else()
+  add_custom_target(javah
+    COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJ
+    COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJCompressor
+    COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJDecompressor
+    COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJTransformer)
+endif()
 
 if(NOT DEFINED CMAKE_INSTALL_DEFAULT_JAVADIR)
   set(CMAKE_INSTALL_DEFAULT_JAVADIR "<CMAKE_INSTALL_DATAROOTDIR>/java")
diff --git a/java/TJBench.java b/java/TJBench.java
index bd555626..e43645ea 100644
--- a/java/TJBench.java
+++ b/java/TJBench.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2014, 2016-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014, 2016-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -121,6 +121,8 @@ final class TJBench {
     int rindex = TJ.getRedOffset(pixelFormat);
     int gindex = TJ.getGreenOffset(pixelFormat);
     int bindex = TJ.getBlueOffset(pixelFormat);
+    if ((long)w[0] * (long)h[0] * (long)ps > (long)Integer.MAX_VALUE)
+      throw new Exception("Image is too large");
     byte[] dstBuf = new byte[w[0] * h[0] * ps];
     int pixels = w[0] * h[0], dstPtr = 0, rgbPtr = 0;
 
@@ -175,8 +177,11 @@ final class TJBench {
 
     tjd = new TJDecompressor();
 
-    if (dstBuf == null)
+    if (dstBuf == null) {
+      if ((long)pitch * (long)scaledh > (long)Integer.MAX_VALUE)
+        throw new Exception("Image is too large");
       dstBuf = new byte[pitch * scaledh];
+    }
 
     /* Set the destination buffer to gray so we know whether the decompressor
        attempted to write to it */
@@ -202,7 +207,9 @@ final class TJBench {
           int width = doTile ? Math.min(tilew, w - x) : scaledw;
           int height = doTile ? Math.min(tileh, h - y) : scaledh;
 
-          tjd.setSourceImage(jpegBuf[tile], jpegSize[tile]);
+          try {
+            tjd.setSourceImage(jpegBuf[tile], jpegSize[tile]);
+          } catch (TJException e) { handleTJException(e); }
           if (doYUV) {
             yuvImage.setBuf(yuvImage.getBuf(), width, yuvPad, height, subsamp);
             try {
@@ -329,6 +336,8 @@ final class TJBench {
     String pfStr = PIXFORMATSTR[pf];
     YUVImage yuvImage = null;
 
+    if ((long)pitch * (long)h > (long)Integer.MAX_VALUE)
+      throw new Exception("Image is too large");
     tmpBuf = new byte[pitch * h];
 
     if (quiet == 0)
@@ -469,6 +478,8 @@ final class TJBench {
       if (!compOnly)
         decomp(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
                fileName, tilew, tileh);
+      else if (quiet == 1)
+        System.out.println("N/A");
 
       if (tilew == w && tileh == h) break;
     }
@@ -489,6 +500,8 @@ final class TJBench {
     int tw, th, ttilew, ttileh, tntilesw, tntilesh, tsubsamp;
 
     FileInputStream fis = new FileInputStream(fileName);
+    if (fis.getChannel().size() > (long)Integer.MAX_VALUE)
+      throw new Exception("Image is too large");
     int srcSize = (int)fis.getChannel().size();
     srcBuf = new byte[srcSize];
     fis.read(srcBuf, 0, srcSize);
@@ -500,7 +513,9 @@ final class TJBench {
 
     tjt = new TJTransformer();
 
-    tjt.setSourceImage(srcBuf, srcSize);
+    try {
+      tjt.setSourceImage(srcBuf, srcSize);
+    } catch (TJException e) { handleTJException(e); }
     w = tjt.getWidth();
     h = tjt.getHeight();
     subsamp = tjt.getSubsamp();
@@ -607,7 +622,9 @@ final class TJBench {
         elapsed = 0.;
         while (true) {
           start = getTime();
-          tjt.transform(jpegBuf, t, flags);
+          try {
+            tjt.transform(jpegBuf, t, flags);
+          } catch (TJException e) { handleTJException(e); }
           jpegSize = tjt.getTransformedSizes();
           elapsed += getTime() - start;
           if (iter >= 0) {
@@ -705,7 +722,7 @@ final class TJBench {
     System.out.println("     bytes to which each row of each plane in the intermediate YUV image is");
     System.out.println("     padded (default = 1)");
     System.out.println("-scale M/N = Scale down the width/height of the decompressed JPEG image by a");
-    System.out.print  ("     factor of M/N (M/N = ");
+    System.out.print("     factor of M/N (M/N = ");
     for (i = 0; i < nsf; i++) {
       System.out.format("%d/%d", scalingFactors[i].getNum(),
                         scalingFactors[i].getDenom());
diff --git a/jchuff.c b/jchuff.c
index 939b3e76..cb05055d 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2019, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -43,8 +43,8 @@
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
@@ -356,6 +356,8 @@ dump_buffer(working_state *state)
   put_buffer = (put_buffer << size) | code; \
 }
 
+#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
+
 #define CHECKBUF15() { \
   if (put_bits > 15) { \
     EMIT_BYTE() \
@@ -363,6 +365,8 @@ dump_buffer(working_state *state)
   } \
 }
 
+#endif
+
 #define CHECKBUF31() { \
   if (put_bits > 31) { \
     EMIT_BYTE() \
@@ -428,7 +432,7 @@ dump_buffer(working_state *state)
  * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
  * larger than 200 bytes.
  */
-#define BUFSIZE  (DCTSIZE2 * 4)
+#define BUFSIZE  (DCTSIZE2 * 8)
 
 #define LOAD_BUFFER() { \
   if (state->free_in_buffer < BUFSIZE) { \
diff --git a/jcmaster.c b/jcmaster.c
index 6732cc98..fd605073 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -507,8 +507,8 @@ prepare_for_pass (j_compress_ptr cinfo)
      */
     master->pass_type = output_pass;
     master->pass_number++;
-    /*FALLTHROUGH*/
 #endif
+    /*FALLTHROUGH*/
   case output_pass:
     /* Do a data-output pass. */
     /* We need not repeat per-scan setup if prior optimization pass did it. */
diff --git a/jconfig.h.in b/jconfig.h.in
index 28427548..18a69a48 100644
--- a/jconfig.h.in
+++ b/jconfig.h.in
@@ -10,16 +10,16 @@
 #define LIBJPEG_TURBO_VERSION_NUMBER  @LIBJPEG_TURBO_VERSION_NUMBER@
 
 /* Support arithmetic encoding */
-#cmakedefine C_ARITH_CODING_SUPPORTED
+#cmakedefine C_ARITH_CODING_SUPPORTED 1
 
 /* Support arithmetic decoding */
-#cmakedefine D_ARITH_CODING_SUPPORTED
+#cmakedefine D_ARITH_CODING_SUPPORTED 1
 
 /* Support in-memory source/destination managers */
-#cmakedefine MEM_SRCDST_SUPPORTED
+#cmakedefine MEM_SRCDST_SUPPORTED 1
 
 /* Use accelerated SIMD routines. */
-#cmakedefine WITH_SIMD
+#cmakedefine WITH_SIMD 1
 
 /*
  * Define BITS_IN_JSAMPLE as either
@@ -33,37 +33,37 @@
 #define BITS_IN_JSAMPLE  @BITS_IN_JSAMPLE@      /* use 8 or 12 */
 
 /* Define to 1 if you have the <locale.h> header file. */
-#cmakedefine HAVE_LOCALE_H
+#cmakedefine HAVE_LOCALE_H 1
 
 /* Define to 1 if you have the <stddef.h> header file. */
-#cmakedefine HAVE_STDDEF_H
+#cmakedefine HAVE_STDDEF_H 1
 
 /* Define to 1 if you have the <stdlib.h> header file. */
-#cmakedefine HAVE_STDLIB_H
+#cmakedefine HAVE_STDLIB_H 1
 
 /* Define if you need to include <sys/types.h> to get size_t. */
-#cmakedefine NEED_SYS_TYPES_H
+#cmakedefine NEED_SYS_TYPES_H 1
 
 /* Define if you have BSD-like bzero and bcopy in <strings.h> rather than
    memset/memcpy in <string.h>. */
-#cmakedefine NEED_BSD_STRINGS
+#cmakedefine NEED_BSD_STRINGS 1
 
 /* Define to 1 if the system has the type `unsigned char'. */
-#cmakedefine HAVE_UNSIGNED_CHAR
+#cmakedefine HAVE_UNSIGNED_CHAR 1
 
 /* Define to 1 if the system has the type `unsigned short'. */
-#cmakedefine HAVE_UNSIGNED_SHORT
+#cmakedefine HAVE_UNSIGNED_SHORT 1
 
 /* Compiler does not support pointers to undefined structures. */
-#cmakedefine INCOMPLETE_TYPES_BROKEN
+#cmakedefine INCOMPLETE_TYPES_BROKEN 1
 
 /* Define if your (broken) compiler shifts signed values as if they were
    unsigned. */
-#cmakedefine RIGHT_SHIFT_IS_UNSIGNED
+#cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
 
 /* Define to 1 if type `char' is unsigned and you are not using gcc.  */
 #ifndef __CHAR_UNSIGNED__
-  #cmakedefine __CHAR_UNSIGNED__
+  #cmakedefine __CHAR_UNSIGNED__ 1
 #endif
 
 /* Define to empty if `const' does not conform to ANSI C. */
diff --git a/jcphuff.c b/jcphuff.c
index 65a04a43..4912283d 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -53,8 +53,8 @@
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
diff --git a/jdatadst-tj.c b/jdatadst-tj.c
index 0bd961bd..fdaa2de1 100644
--- a/jdatadst-tj.c
+++ b/jdatadst-tj.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2014, 2016, D. R. Commander.
+ * Copyright (C) 2011, 2014, 2016, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -27,6 +27,8 @@
 extern void *malloc(size_t size);
 extern void free(void *ptr);
 #endif
+void jpeg_mem_dest_tj(j_compress_ptr cinfo, unsigned char **outbuffer,
+                      unsigned long *outsize, boolean alloc);
 
 
 #define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
@@ -101,8 +103,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
 
   MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
 
-  if (dest->newbuffer != NULL)
-    free(dest->newbuffer);
+  free(dest->newbuffer);
 
   dest->newbuffer = nextbuffer;
 
diff --git a/jdatadst.c b/jdatadst.c
index 9f504fe0..a1a31b65 100644
--- a/jdatadst.c
+++ b/jdatadst.c
@@ -144,8 +144,7 @@ empty_mem_output_buffer (j_compress_ptr cinfo)
 
   MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
 
-  if (dest->newbuffer != NULL)
-    free(dest->newbuffer);
+  free(dest->newbuffer);
 
   dest->newbuffer = nextbuffer;
 
diff --git a/jdatasrc-tj.c b/jdatasrc-tj.c
index 1c713073..69fb5eaa 100644
--- a/jdatasrc-tj.c
+++ b/jdatasrc-tj.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2016, D. R. Commander.
+ * Copyright (C) 2011, 2016, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,6 +23,9 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
+void jpeg_mem_src_tj(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                     unsigned long insize);
+
 
 /*
  * Initialize source --- called by jpeg_read_header
diff --git a/jdcolor.c b/jdcolor.c
index fd7f661d..dc0e3b6c 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -592,7 +592,7 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 /* Declarations for ordered dithering
  *
  * We use a 4x4 ordered dither array packed into 32 bits.  This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
  */
 
 #define DITHER_MASK       0x3
diff --git a/jdhuff.c b/jdhuff.c
index 95f38e54..a1128178 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -589,7 +589,11 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->dc_needed[blkn]) {
       /* Convert DC difference to actual value, update last_dc_val */
       int ci = cinfo->MCU_membership[blkn];
-      s += state.last_dc_val[ci];
+      /* This is really just
+       *   s += state.last_dc_val[ci];
+       * It is written this way in order to shut up UBSan.
+       */
+      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
       state.last_dc_val[ci] = s;
       if (block) {
         /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
@@ -684,7 +688,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     if (entropy->dc_needed[blkn]) {
       int ci = cinfo->MCU_membership[blkn];
-      s += state.last_dc_val[ci];
+      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
       state.last_dc_val[ci] = s;
       if (block)
         (*block)[0] = (JCOEF)s;
diff --git a/jdmerge.c b/jdmerge.c
index d67faec0..dff5a350 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -429,8 +429,6 @@ h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 #define PACK_TWO_PIXELS_LE(l, r)    ((r << 16) | l)
 #define PACK_TWO_PIXELS_BE(l, r)    ((l << 16) | r)
 
-#define PACK_NEED_ALIGNMENT(ptr)    (((size_t)(ptr)) & 3)
-
 #define WRITE_TWO_PIXELS_LE(addr, pixels) { \
   ((INT16 *)(addr))[0] = (INT16)(pixels); \
   ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
@@ -448,7 +446,7 @@ h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 /* Declarations for ordered dithering
  *
  * We use a 4x4 ordered dither array packed into 32 bits.  This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
  */
 
 #define DITHER_MASK       0x3
diff --git a/jdsample.c b/jdsample.c
index 52ee9af4..50a68b30 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2019, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -315,9 +316,9 @@ h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr0, inptr1, outptr;
 #if BITS_IN_JSAMPLE == 8
-  int thiscolsum;
+  int thiscolsum, bias;
 #else
-  JLONG thiscolsum;
+  JLONG thiscolsum, bias;
 #endif
   JDIMENSION colctr;
   int inrow, outrow, v;
@@ -327,15 +328,18 @@ h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     for (v = 0; v < 2; v++) {
       /* inptr0 points to nearest input row, inptr1 points to next nearest */
       inptr0 = input_data[inrow];
-      if (v == 0)               /* next nearest is row above */
+      if (v == 0) {             /* next nearest is row above */
         inptr1 = input_data[inrow - 1];
-      else                      /* next nearest is row below */
+        bias = 1;
+      } else {                  /* next nearest is row below */
         inptr1 = input_data[inrow + 1];
+        bias = 2;
+      }
       outptr = output_data[outrow++];
 
       for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
         thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-        *outptr++ = (JSAMPLE)((thiscolsum + 1) >> 2);
+        *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
     inrow++;
diff --git a/jfdctint.c b/jfdctint.c
index c0391a92..b47c3061 100644
--- a/jfdctint.c
+++ b/jfdctint.c
@@ -1,7 +1,7 @@
 /*
  * jfdctint.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
diff --git a/jidctint.c b/jidctint.c
index 55573429..98425d5f 100644
--- a/jidctint.c
+++ b/jidctint.c
@@ -1,7 +1,7 @@
 /*
  * jidctint.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modification developed 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
diff --git a/jidctred.c b/jidctred.c
index 1ff352f8..1dd65a94 100644
--- a/jidctred.c
+++ b/jidctred.c
@@ -1,7 +1,7 @@
 /*
  * jidctred.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
diff --git a/jpegtran.c b/jpegtran.c
index f67e7225..6ed99893 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -129,7 +129,7 @@ select_transform (JXFORM_CODE transform)
 
 LOCAL(int)
 parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-                int last_file_arg_seen, boolean for_real)
+               int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
diff --git a/jquant1.c b/jquant1.c
index a336abdb..40bbb28c 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -154,7 +154,7 @@ typedef struct {
    */
   boolean is_padded;            /* is the colorindex padded for odither? */
 
-  int Ncolors[MAX_Q_COMPS];     /* # of values alloced to each component */
+  int Ncolors[MAX_Q_COMPS];     /* # of values allocated to each component */
 
   /* Variables for ordered dithering */
   int row_index;                /* cur row's vertical index in dither matrix */
diff --git a/jversion.h b/jversion.h
index 40fd9701..22cf316e 100644
--- a/jversion.h
+++ b/jversion.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2018, D. R. Commander.
+ * Copyright (C) 2010, 2012-2020, D. R. Commander.
  * mozjpeg Modifications:
  * Copyright (C) 2014, Mozilla Corporation.
  * For conditions of distribution and use, see the accompanying README file.
@@ -37,18 +37,18 @@
  */
 
 #define JCOPYRIGHT \
-  "Copyright (C) 2009-2018 D. R. Commander\n" \
-                        "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+  "Copyright (C) 2009-2020 D. R. Commander\n" \
+  "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2015 Intel Corporation\n" \
-                        "Copyright (C) 2015 Google, Inc.\n" \
+  "Copyright (C) 2015 Google, Inc.\n" \
                         "Copyright (C) 2014 Mozilla Corporation\n" \
-                        "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
-                        "Copyright (C) 2013 Linaro Limited\n" \
-                        "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
-                        "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
-                        "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+  "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+  "Copyright (C) 2013 Linaro Limited\n" \
+  "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+  "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
+  "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
   "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding"
 
 #define JCOPYRIGHT_SHORT \
-  "Copyright (C) 1991-2018 The libjpeg-turbo Project and many others"
+  "Copyright (C) 1991-2020 The libjpeg-turbo Project and many others"
diff --git a/md5/md5hl.c b/md5/md5hl.c
index ecd2e236..8a4a762f 100644
--- a/md5/md5hl.c
+++ b/md5/md5hl.c
@@ -6,7 +6,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  * libjpeg-turbo Modifications:
- * Copyright (C)2016, 2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2016, 2018-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -56,7 +56,7 @@
 
 #include "./md5.h"
 
-char *MD5End(MD5_CTX *ctx, char *buf)
+static char *MD5End(MD5_CTX *ctx, char *buf)
 {
   int i;
   unsigned char digest[LENGTH];
@@ -89,7 +89,7 @@ char *MD5FileChunk(const char *filename, char *buf, off_t ofs, off_t len)
   off_t n;
 
   MD5Init(&ctx);
-#if _WIN32
+#ifdef _WIN32
   f = _open(filename, O_RDONLY | O_BINARY);
 #else
   f = open(filename, O_RDONLY);
@@ -123,12 +123,3 @@ char *MD5FileChunk(const char *filename, char *buf, off_t ofs, off_t len)
     return 0;
   return (MD5End(&ctx, buf));
 }
-
-char *MD5Data(const void *data, unsigned int len, char *buf)
-{
-  MD5_CTX ctx;
-
-  MD5Init(&ctx);
-  MD5Update(&ctx, (unsigned char *)data, len);
-  return (MD5End(&ctx, buf));
-}
diff --git a/rdtarga.c b/rdtarga.c
index e0c6947b..37bd286a 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -3,8 +3,9 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * Modified 2017 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2018, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -66,6 +67,7 @@ typedef struct _tga_source_struct {
   U_CHAR tga_pixel[4];
 
   int pixel_size;               /* Bytes per Targa pixel (1 to 4) */
+  int cmap_length;              /* colormap length */
 
   /* State info for reading RLE-coded pixels; both counts must be init to 0 */
   int block_count;              /* # of pixels remaining in RLE block */
@@ -196,11 +198,14 @@ get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   register JSAMPROW ptr;
   register JDIMENSION col;
   register JSAMPARRAY colormap = source->colormap;
+  int cmaplen = source->cmap_length;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
     t = UCH(source->tga_pixel[0]);
+    if (t >= cmaplen)
+      ERREXIT(cinfo, JERR_TGA_BADPARMS);
     *ptr++ = colormap[0][t];
     *ptr++ = colormap[1][t];
     *ptr++ = colormap[2][t];
@@ -452,12 +457,14 @@ start_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
     /* Allocate space to store the colormap */
     source->colormap = (*cinfo->mem->alloc_sarray)
       ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)maplen, (JDIMENSION)3);
+    source->cmap_length = (int)maplen;
     /* and read it from the file */
     read_colormap(source, (int)maplen, UCH(targaheader[7]));
   } else {
     if (cmaptype)               /* but you promised a cmap! */
       ERREXIT(cinfo, JERR_TGA_BADPARMS);
     source->colormap = NULL;
+    source->cmap_length = 0;
   }
 
   cinfo->input_components = components;
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index cf9012af..0a087114 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/installer.nsi.in b/release/installer.nsi.in
index 00cd5f87..44419fa8 100755
--- a/release/installer.nsi.in
+++ b/release/installer.nsi.in
@@ -62,15 +62,15 @@ Section "@CMAKE_PROJECT_NAME@ SDK for @INST_PLATFORM@ (required)"
 	File "@CMAKE_CURRENT_BINARY_DIR@\libturbojpeg.a"
 	File "@CMAKE_CURRENT_BINARY_DIR@\libjpeg.dll.a"
 	File "@CMAKE_CURRENT_BINARY_DIR@\libjpeg.a"
-	SetOutPath $INSTDIR\lib\pkgconfig
-	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libjpeg.pc"
-	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libturbojpeg.pc"
 !else
 	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}turbojpeg.lib"
 	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}turbojpeg-static.lib"
 	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}jpeg.lib"
 	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}jpeg-static.lib"
 !endif
+	SetOutPath $INSTDIR\lib\pkgconfig
+	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libjpeg.pc"
+	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libturbojpeg.pc"
 !ifdef JAVA
 	SetOutPath $INSTDIR\classes
 	File "@CMAKE_CURRENT_BINARY_DIR@\java\turbojpeg.jar"
@@ -130,8 +130,6 @@ Section "Uninstall"
 	Delete $INSTDIR\lib\libturbojpeg.a
 	Delete $INSTDIR\lib\libjpeg.dll.a
 	Delete $INSTDIR\lib\libjpeg.a
-	Delete $INSTDIR\lib\pkgconfig\libjpeg.pc
-	Delete $INSTDIR\lib\pkgconfig\libturbojpeg.pc
 !else
 	Delete $INSTDIR\bin\jpeg@SO_MAJOR_VERSION@.dll
 	Delete $INSTDIR\bin\turbojpeg.dll
@@ -141,6 +139,8 @@ Section "Uninstall"
 	Delete $INSTDIR\lib\turbojpeg.lib
 	Delete $INSTDIR\lib\turbojpeg-static.lib
 !endif
+	Delete $INSTDIR\lib\pkgconfig\libjpeg.pc
+	Delete $INSTDIR\lib\pkgconfig\libturbojpeg.pc
 !ifdef JAVA
 	Delete $INSTDIR\classes\turbojpeg.jar
 !endif
@@ -175,9 +175,7 @@ Section "Uninstall"
 !endif
 
 	RMDir "$INSTDIR\include"
-!ifdef GCC
 	RMDir "$INSTDIR\lib\pkgconfig"
-!endif
 	RMDir "$INSTDIR\lib"
 	RMDir "$INSTDIR\doc"
 !ifdef GCC
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 6ede08b4..fb16f41d 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -58,6 +58,8 @@ BUILDDIRARMV7=@IOS_ARMV7_BUILD@
 BUILDDIRARMV7S=@IOS_ARMV7S_BUILD@
 BUILDDIRARMV8=@IOS_ARMV8_BUILD@
 WITH_JAVA=@WITH_JAVA@
+OSX_APP_CERT_NAME="@OSX_APP_CERT_NAME@"
+OSX_INST_CERT_NAME="@OSX_INST_CERT_NAME@"
 LIPO=lipo
 
 PREFIX=@CMAKE_INSTALL_PREFIX@
@@ -228,7 +230,7 @@ if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7S" != "" ]; then
 	install_ios $BUILDDIRARMV7S ARMv7s armv7s arm
 fi
 
-if [ $UNIVERSAL = 1 -a "BUILDDIRARMV8" != "" ]; then
+if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV8" != "" ]; then
 	install_ios $BUILDDIRARMV8 ARMv8 armv8 arm64
 	fi
 
@@ -258,11 +260,25 @@ cp $SRCDIR/release/License.rtf $SRCDIR/release/Welcome.rtf $SRCDIR/release/ReadM
 mkdir $TMPDIR/dmg
 pkgbuild --root $PKGROOT --version $VERSION.$BUILD --identifier @PKGID@ \
 	$TMPDIR/pkg/$PKGNAME.pkg
+SUFFIX=
+if [ "$OSX_INST_CERT_NAME" != "" ]; then
+	SUFFIX=-unsigned
+fi
 productbuild --distribution pkgscripts/Distribution.xml \
 	--package-path $TMPDIR/pkg/ --resources $TMPDIR/pkg/ \
-	$TMPDIR/dmg/$PKGNAME.pkg
+	$TMPDIR/dmg/$PKGNAME$SUFFIX.pkg
+if [ "$OSX_INST_CERT_NAME" != "" ]; then
+	productsign --sign "$OSX_INST_CERT_NAME" --timestamp \
+		$TMPDIR/dmg/$PKGNAME$SUFFIX.pkg $TMPDIR/dmg/$PKGNAME.pkg
+	rm -r $TMPDIR/dmg/$PKGNAME$SUFFIX.pkg
+	pkgutil --check-signature $TMPDIR/dmg/$PKGNAME.pkg
+fi
 hdiutil create -fs HFS+ -volname $PKGNAME-$VERSION \
 	-srcfolder "$TMPDIR/dmg" $TMPDIR/$PKGNAME-$VERSION.dmg
+if [ "$OSX_APP_CERT_NAME" != "" ]; then
+	codesign -s "$OSX_APP_CERT_NAME" --timestamp $TMPDIR/$PKGNAME-$VERSION.dmg
+	codesign -vv $TMPDIR/$PKGNAME-$VERSION.dmg
+fi
 cp $TMPDIR/$PKGNAME-$VERSION.dmg .
 
 exit
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
index a22c5e1a..83a1669f 100644
--- a/release/rpm.spec.in
+++ b/release/rpm.spec.in
@@ -1,7 +1,7 @@
+%global _docdir %{_defaultdocdir}/%{name}-%{version}
 %define _prefix @CMAKE_INSTALL_PREFIX@
 %define _bindir @CMAKE_INSTALL_FULL_BINDIR@
 %define _datarootdir @CMAKE_INSTALL_FULL_DATAROOTDIR@
-%define _docdir %{_defaultdocdir}/%{name}-%{version}
 %define _includedir @CMAKE_INSTALL_FULL_INCLUDEDIR@
 %define _javadir @CMAKE_INSTALL_FULL_JAVADIR@
 %define _mandir @CMAKE_INSTALL_FULL_MANDIR@
@@ -43,7 +43,7 @@ Group: System Environment/Libraries
 Release: @BUILD@
 License: BSD-style
 BuildRoot: %{_blddir}/%{name}-buildroot-%{version}-%{release}
-Prereq: /sbin/ldconfig
+Requires: /sbin/ldconfig
 %if "%{_bits}" == "64"
 Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{release}, libturbojpeg.so()(64bit)
 %else
@@ -51,14 +51,14 @@ Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{r
 %endif
 
 %description
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
-on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
-compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal.  On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines.  In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal.  On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines.  In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less
 powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
@@ -183,7 +183,7 @@ rm -rf $RPM_BUILD_ROOT
 %if "%{_enable_static}" == "1"
  %{_libdir}/libjpeg.a
 %endif
-%{_libdir}/pkgconfig
+%dir %{_libdir}/pkgconfig
 %{_libdir}/pkgconfig/libjpeg.pc
 %if "%{_with_turbojpeg}" == "1"
  %if "%{_enable_shared}" == "1" || "%{_with_java}" == "1"
diff --git a/sharedlib/CMakeLists.txt b/sharedlib/CMakeLists.txt
index 5a8ae7b4..f0a7d876 100755
--- a/sharedlib/CMakeLists.txt
+++ b/sharedlib/CMakeLists.txt
@@ -23,7 +23,7 @@ foreach(src ${JPEG_SOURCES})
   set(JPEG_SRCS ${JPEG_SRCS} ../${src})
 endforeach()
 
-if(WITH_SIMD AND MSVC_IDE)
+if(WITH_SIMD AND (MSVC_IDE OR XCODE))
   # This tells CMake that the "source" files haven't been generated yet
   set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
 endif()
@@ -55,7 +55,8 @@ if(MAPFLAG)
     LINK_FLAGS "${MAPFLAG}${CMAKE_CURRENT_BINARY_DIR}/../libjpeg.map")
 endif()
 if(MSVC)
-  set_target_properties(jpeg PROPERTIES SUFFIX ${SO_MAJOR_VERSION}.dll)
+  set_target_properties(jpeg PROPERTIES
+    RUNTIME_OUTPUT_NAME jpeg${SO_MAJOR_VERSION})
   # The jsimd_*.c file is built using /MT, so this prevents a linker warning.
   set_target_properties(jpeg PROPERTIES LINK_FLAGS "/NODEFAULTLIB:LIBCMT /NODEFAULTLIB:LIBCMTD")
 elseif(MINGW)
@@ -94,3 +95,8 @@ install(TARGETS jpeg cjpeg djpeg jpegtran
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
+  CMAKE_C_LINKER_SUPPORTS_PDB)
+  install(FILES "$<TARGET_PDB_FILE:jpeg>"
+    DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL)
+endif()
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 3a4e3c4a..60eda18f 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -38,6 +38,14 @@ elseif(CPU_TYPE STREQUAL "i386")
   endif()
 endif()
 
+if(NOT REQUIRE_SIMD)
+  include(CheckLanguage)
+  check_language(ASM_NASM)
+  if(NOT CMAKE_ASM_NASM_COMPILER)
+    simd_fail("SIMD extensions disabled: could not find NASM compiler")
+    return()
+  endif()
+endif()
 enable_language(ASM_NASM)
 message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
 
@@ -74,12 +82,12 @@ if(CMAKE_ASM_NASM_COMPILER_TYPE MATCHES "yasm")
     if(${var} STREQUAL "-g")
       if(CMAKE_ASM_NASM_DEBUG_FORMAT)
         set_property(CACHE ${var} PROPERTY VALUE "-g ${CMAKE_ASM_NASM_DEBUG_FORMAT}")
-  else()
+      else()
         set_property(CACHE ${var} PROPERTY VALUE "")
       endif()
     endif()
   endforeach()
-  endif()
+endif()
 
 if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
   set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPIC")
@@ -135,6 +143,9 @@ endif()
 if(MSVC_IDE)
   set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
   string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
+elseif(XCODE)
+  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}")
+  string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
 endif()
 
 file(GLOB INC_FILES nasm/*.inc)
@@ -162,25 +173,25 @@ foreach(file ${SIMD_SOURCES})
       ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
   endif()
   set(OBJECT_DEPENDS ${OBJECT_DEPENDS} ${INC_FILES})
-  if(MSVC_IDE)
+  if(MSVC_IDE OR XCODE)
     # The CMake Visual Studio generators do not work properly with the ASM_NASM
     # language, so we have to go rogue here and use a custom command like we
     # did in prior versions of libjpeg-turbo.  (This is why we can't have nice
     # things.)
     string(REGEX REPLACE "${CPU_TYPE}/" "" filename ${file})
-    set(SIMD_OBJ ${OBJDIR}/${filename}.obj)
+    set(SIMD_OBJ ${OBJDIR}/${filename}${CMAKE_C_OUTPUT_EXTENSION})
     add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${file} ${OBJECT_DEPENDS}
       COMMAND ${CMAKE_ASM_NASM_COMPILER} -f${CMAKE_ASM_NASM_OBJECT_FORMAT}
         ${CMAKE_ASM_NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}
         -o${SIMD_OBJ})
-  set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
+    set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
   else()
     set_source_files_properties(${file} PROPERTIES OBJECT_DEPENDS
       "${OBJECT_DEPENDS}")
   endif()
 endforeach()
 
-if(MSVC_IDE)
+if(MSVC_IDE OR XCODE)
 set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
   add_library(simd OBJECT ${CPU_TYPE}/jsimd.c)
   add_custom_target(simd-objs DEPENDS ${SIMD_OBJS})
@@ -262,7 +273,7 @@ endif()
 # MIPS (GAS)
 ###############################################################################
 
-elseif(CPU_TYPE STREQUAL "mips")
+elseif(CPU_TYPE STREQUAL "mips" OR CPU_TYPE STREQUAL "mipsel")
 
 enable_language(ASM)
 
@@ -293,7 +304,7 @@ if(NOT HAVE_DSPR2)
   return()
 endif()
 
-add_library(simd OBJECT ${CPU_TYPE}/jsimd_dspr2.S ${CPU_TYPE}/jsimd.c)
+add_library(simd OBJECT mips/jsimd_dspr2.S mips/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
diff --git a/simd/arm/jsimd.c b/simd/arm/jsimd.c
index 0fb81976..45f9b047 100644
--- a/simd/arm/jsimd.c
+++ b/simd/arm/jsimd.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
  * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -30,7 +31,7 @@
 static unsigned int simd_support = ~0;
 static unsigned int simd_huffman = 1;
 
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
 
 #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
 
@@ -105,7 +106,7 @@ init_simd(void)
 #ifndef NO_GETENV
   char *env = NULL;
 #endif
-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
   int bufsize = 1024; /* an initial guess for the line buffer size limit */
 #endif
 
diff --git a/simd/arm64/jsimd_neon.S b/simd/arm64/jsimd_neon.S
index 218e1ae7..a3aa4066 100644
--- a/simd/arm64/jsimd_neon.S
+++ b/simd/arm64/jsimd_neon.S
@@ -31,6 +31,251 @@
 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
+#if defined(__APPLE__)
+.section __DATA, __const
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_idct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_idct_ifast_neon() */
+
+.balign 16
+Ljsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
+
+/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
+
+#define CONST_BITS  13
+
+#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
+
+.balign 16
+Ljsimd_idct_4x4_neon_consts:
+  .short FIX_1_847759065        /* v0.h[0] */
+  .short -FIX_0_765366865       /* v0.h[1] */
+  .short -FIX_0_211164243       /* v0.h[2] */
+  .short FIX_1_451774981        /* v0.h[3] */
+  .short -FIX_2_172734803       /* v1.h[0] */
+  .short FIX_1_061594337        /* v1.h[1] */
+  .short -FIX_0_509795579       /* v1.h[2] */
+  .short -FIX_0_601344887       /* v1.h[3] */
+  .short FIX_0_899976223        /* v2.h[0] */
+  .short FIX_2_562915447        /* v2.h[1] */
+  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
+  .short 0                      /* v2.h[3] */
+
+.balign 8
+Ljsimd_idct_2x2_neon_consts:
+  .short -FIX_0_720959822  /* v14[0] */
+  .short FIX_0_850430095   /* v14[1] */
+  .short -FIX_1_272758580  /* v14[2] */
+  .short FIX_3_624509785   /* v14[3] */
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_fdct_ifast_neon() */
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
+
+/* Constants for jsimd_h2*_downsample_neon() */
+
+.balign 16
+Ljsimd_h2_downsample_neon_consts:
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
+        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
+        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
+  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
+  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+
 .text
 
 
@@ -55,6 +300,17 @@ _\fname:
 #endif
 .endm
 
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+    adrp            \reg, \symbol@PAGE
+    add             \reg, \reg, \symbol@PAGEOFF
+#else
+    adrp            \reg, \symbol
+    add             \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
 /* Transpose elements of single 128 bit registers */
 .macro transpose_single x0, x1, xi, xilen, literal
     ins             \xi\xilen[0], \x0\xilen[0]
@@ -63,7 +319,7 @@ _\fname:
     trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
-/* Transpose elements of 2 differnet registers */
+/* Transpose elements of 2 different registers */
 .macro transpose x0, x1, xi, xilen, literal
     mov             \xi\xilen, \x0\xilen
     trn1            \x0\literal, \x0\literal, \x1\literal
@@ -139,51 +395,6 @@ _\fname:
 #define CONST_BITS  13
 #define PASS1_BITS  2
 
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
 #define XFIX_P_0_298  v0.h[0]
 #define XFIX_N_0_390  v0.h[1]
 #define XFIX_P_0_541  v0.h[2]
@@ -217,7 +428,7 @@ asm_function jsimd_idct_islow_neon
     uxtw x3, w3
 
     sub             sp, sp, #64
-    adr             x15, Ljsimd_idct_islow_neon_consts
+    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
     mov             x10, sp
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
@@ -791,13 +1002,6 @@ asm_function jsimd_idct_islow_neon
 #define XFIX_1_847759065  v0.h[2]
 #define XFIX_2_613125930  v0.h[3]
 
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
 asm_function jsimd_idct_ifast_neon
 
     DCT_TABLE       .req x0
@@ -832,7 +1036,7 @@ asm_function jsimd_idct_ifast_neon
      *   7 | d30     | d31     ( v23.8h )
      */
     /* Save NEON registers used in fast IDCT */
-    adr             TMP5, Ljsimd_idct_ifast_neon_consts
+    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
     ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
     ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
@@ -1023,38 +1227,6 @@ asm_function jsimd_idct_ifast_neon
  *       but readability will suffer somewhat.
  */
 
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* v0.h[0] */
-  .short -FIX_0_765366865       /* v0.h[1] */
-  .short -FIX_0_211164243       /* v0.h[2] */
-  .short FIX_1_451774981        /* v0.h[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* v2.h[0] */
-  .short FIX_2_562915447        /* v2.h[1] */
-  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
-  .short 0                      /* v2.h[3] */
-
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
     smull           v28.4s, \x4, v2.h[2]
     smlal           v28.4s, \x8, v0.h[0]
@@ -1121,7 +1293,7 @@ asm_function jsimd_idct_4x4_neon
     sub             sp, sp, 64
     mov             x9, sp
     /* Load constants (v3.4h is just used for padding) */
-    adr             TMP4, Ljsimd_idct_4x4_neon_consts
+    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
@@ -1264,13 +1436,6 @@ asm_function jsimd_idct_4x4_neon
  *       bit exact compatibility with jpeg-6b.
  */
 
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* v14[0] */
-  .short FIX_0_850430095   /* v14[1] */
-  .short -FIX_1_272758580  /* v14[2] */
-  .short FIX_3_624509785   /* v14[3] */
-
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
     sshll           v15.4s, \x4, #15
     smull           v26.4s, \x6, v14.h[3]
@@ -1311,7 +1476,7 @@ asm_function jsimd_idct_2x2_neon
     mov             x9, sp
 
     /* Load constants */
-    adr             TMP2, Ljsimd_idct_2x2_neon_consts
+    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v14.4h}, [TMP2]
@@ -1663,21 +1828,6 @@ asm_function jsimd_idct_2x2_neon
     do_yuv_to_rgb_stage2
 .endm
 
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
 .if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
 .else
@@ -1703,11 +1853,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     mov             x9, sp
 
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    .if \fast_st3 == 1
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
-    .else
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
-    .endif
+    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
 
     /* Save NEON registers */
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
@@ -2004,17 +2150,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,
     do_rgb_to_yuv_stage1
 .endm
 
-.balign 16
-.if \fast_ld3 == 1
-Ljsimd_\colorid\()_ycc_neon_consts:
-.else
-Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
-.endif
-  .short 19595, 38470, 7471, 11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128, 32767, 128
-  .short 32767, 128, 32767, 128
-
 .if \fast_ld3 == 1
 asm_function jsimd_\colorid\()_ycc_convert_neon
 .else
@@ -2037,11 +2172,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     N               .req w12
 
     /* Load constants to d0, d1, d2, d3 */
-    .if \fast_ld3 == 1
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
-    .else
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
-    .endif
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
     ld1             {v0.8h, v1.8h}, [x13]
 
     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
@@ -2241,50 +2372,6 @@ asm_function jsimd_convsamp_neon
 #define DESCALE_P1  (CONST_BITS - PASS1_BITS)
 #define DESCALE_P2  (CONST_BITS + PASS1_BITS)
 
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
 #define XFIX_P_0_298  v0.h[0]
 #define XFIX_N_0_390  v0.h[1]
 #define XFIX_P_0_541  v0.h[2]
@@ -2304,7 +2391,7 @@ asm_function jsimd_fdct_islow_neon
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
     ld1             {v0.8h, v1.8h}, [TMP]
 
     /* Save NEON registers */
@@ -2583,20 +2670,13 @@ asm_function jsimd_fdct_islow_neon
 #define XFIX_0_707106781  v0.h[2]
 #define XFIX_1_306562965  v0.h[3]
 
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
 asm_function jsimd_fdct_ifast_neon
 
     DATA            .req x0
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_ifast_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
     ld1             {v0.4h}, [TMP]
 
     /* Load all DATA into NEON registers with the following allocation:
@@ -2775,41 +2855,6 @@ asm_function jsimd_quantize_neon
  *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  */
 
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
-
 asm_function jsimd_h2v1_downsample_neon
     IMAGE_WIDTH     .req x0
     MAX_V_SAMP      .req x1
@@ -2827,7 +2872,7 @@ asm_function jsimd_h2v1_downsample_neon
     mov             TMPDUP, #0x10000
     lsl             TMP2, BLOCK_WIDTH, #4
     sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
     add             TMP3, TMP3, TMP2, lsl #4
     dup             v16.4s, TMPDUP
     ld1             {v18.16b}, [TMP3]
@@ -2906,7 +2951,7 @@ asm_function jsimd_h2v2_downsample_neon
     lsl             TMP2, BLOCK_WIDTH, #4
     lsl             TMPDUP, TMPDUP, #17
     sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
     orr             TMPDUP, TMPDUP, #1
     add             TMP3, TMP3, TMP2, lsl #4
     dup             v16.4s, TMPDUP
@@ -3012,41 +3057,6 @@ asm_function jsimd_h2v2_downsample_neon
 
 .macro generate_jsimd_huff_encode_one_block fast_tbl
 
-.balign 16
-.if \fast_tbl == 1
-Ljsimd_huff_encode_one_block_neon_consts:
-.else
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
-.endif
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-.if \fast_tbl == 1
-    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
-            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
-    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
-            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
-    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
-           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
-    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
-            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
-    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
-            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
-    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
-            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
-    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
-            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
-    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
-            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
-    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
-    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
-.endif
-
 .if \fast_tbl == 1
 asm_function jsimd_huff_encode_one_block_neon
 .else
@@ -3056,11 +3066,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
     stp             x19, x20, [sp]
-.if \fast_tbl == 1
-    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
-    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
+    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
     ldr             PUT_BUFFER, [x0, #0x10]
     ldr             PUT_BITSw, [x0, #0x18]
     ldrsh           w12, [x2]               /* load DC coeff in w12 */
diff --git a/simd/i386/jccolext-avx2.asm b/simd/i386/jccolext-avx2.asm
index 7a8d784a..c46d6844 100644
--- a/simd/i386/jccolext-avx2.asm
+++ b/simd/i386/jccolext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -110,12 +108,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jccolext-mmx.asm b/simd/i386/jccolext-mmx.asm
index 9a2c30e2..6357a42b 100644
--- a/simd/i386/jccolext-mmx.asm
+++ b/simd/i386/jccolext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -111,13 +109,13 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
     xor         eax, eax
-    mov         al, BYTE [esi+ecx]
+    mov         al, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
     xor         edx, edx
-    mov         dx, WORD [esi+ecx]
+    mov         dx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
@@ -127,7 +125,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     test        cl, SIZEOF_DWORD
     jz          short .column_ld8
     sub         ecx, byte SIZEOF_DWORD
-    movd        mmG, DWORD [esi+ecx]
+    movd        mmG, dword [esi+ecx]
     psllq       mmA, DWORD_BIT
     por         mmA, mmG
 .column_ld8:
@@ -197,7 +195,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     test        cl, SIZEOF_MMWORD/8
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_MMWORD/8
-    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
 .column_ld2:
     test        cl, SIZEOF_MMWORD/4
     jz          short .column_ld4
diff --git a/simd/i386/jccolext-sse2.asm b/simd/i386/jccolext-sse2.asm
index e830562c..c6c80852 100644
--- a/simd/i386/jccolext-sse2.asm
+++ b/simd/i386/jccolext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -109,12 +107,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jccolor-avx2.asm b/simd/i386/jccolor-avx2.asm
index 958517f3..14944e95 100644
--- a/simd/i386/jccolor-avx2.asm
+++ b/simd/i386/jccolor-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jccolor-mmx.asm b/simd/i386/jccolor-mmx.asm
index 47be9e1e..8cb399bd 100644
--- a/simd/i386/jccolor-mmx.asm
+++ b/simd/i386/jccolor-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jccolor-sse2.asm b/simd/i386/jccolor-sse2.asm
index c0d5d45e..686d222f 100644
--- a/simd/i386/jccolor-sse2.asm
+++ b/simd/i386/jccolor-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgray-avx2.asm b/simd/i386/jcgray-avx2.asm
index 4d66242c..560ee0c7 100644
--- a/simd/i386/jcgray-avx2.asm
+++ b/simd/i386/jcgray-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgray-mmx.asm b/simd/i386/jcgray-mmx.asm
index 07c7ea6c..79fdf082 100644
--- a/simd/i386/jcgray-mmx.asm
+++ b/simd/i386/jcgray-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgray-sse2.asm b/simd/i386/jcgray-sse2.asm
index 4b8c7971..cb4b28e8 100644
--- a/simd/i386/jcgray-sse2.asm
+++ b/simd/i386/jcgray-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgryext-avx2.asm b/simd/i386/jcgryext-avx2.asm
index 52e99a84..3fa7973d 100644
--- a/simd/i386/jcgryext-avx2.asm
+++ b/simd/i386/jcgryext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -102,12 +100,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jcgryext-mmx.asm b/simd/i386/jcgryext-mmx.asm
index 4a9ab0da..8af42e5a 100644
--- a/simd/i386/jcgryext-mmx.asm
+++ b/simd/i386/jcgryext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -103,13 +101,13 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
     xor         eax, eax
-    mov         al, BYTE [esi+ecx]
+    mov         al, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
     xor         edx, edx
-    mov         dx, WORD [esi+ecx]
+    mov         dx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
@@ -119,7 +117,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     test        cl, SIZEOF_DWORD
     jz          short .column_ld8
     sub         ecx, byte SIZEOF_DWORD
-    movd        mmG, DWORD [esi+ecx]
+    movd        mmG, dword [esi+ecx]
     psllq       mmA, DWORD_BIT
     por         mmA, mmG
 .column_ld8:
@@ -189,7 +187,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     test        cl, SIZEOF_MMWORD/8
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_MMWORD/8
-    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
 .column_ld2:
     test        cl, SIZEOF_MMWORD/4
     jz          short .column_ld4
diff --git a/simd/i386/jcgryext-sse2.asm b/simd/i386/jcgryext-sse2.asm
index 04d891cd..c9d6ff1e 100644
--- a/simd/i386/jcgryext-sse2.asm
+++ b/simd/i386/jcgryext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -101,12 +99,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index 6ea69f6e..79f0ca52 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains an SSE2 implementation for Huffman coding of one block.
 ; The following code is based directly on jchuff.c; see jchuff.c for more
 ; details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -197,8 +195,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     push        ebp
 
     mov         esi, POINTER [eax+8]       ; (working_state *state)
-    mov         put_buffer, DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits, DWORD [esi+12]   ; put_bits = state->cur.put_bits;
+    mov         put_buffer, dword [esi+8]  ; put_buffer = state->cur.put_buffer;
+    mov         put_bits, dword [esi+12]   ; put_bits = state->cur.put_bits;
     push        esi                        ; esi is now scratch
 
     get_GOT     edx                        ; get GOT address
@@ -214,7 +212,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     ; Encode the DC coefficient difference per section F.1.2.1
     mov         esi, POINTER [esp+block]  ; block
     movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
-    sub         ecx, DWORD [eax+20]
+    sub         ecx, dword [eax+20]
     mov         esi, ecx
 
     ; This is a well-known technique for obtaining the absolute value
@@ -229,12 +227,12 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     ; For a negative input, want temp2 = bitwise complement of abs(input)
     ; This code assumes we are on a two's complement machine
     add         esi, edx                ; temp2 += temp3;
-    mov         DWORD [esp+temp], esi   ; backup temp2 in temp
+    mov         dword [esp+temp], esi   ; backup temp2 in temp
 
     ; Find the number of bits needed for the magnitude of the coefficient
     movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
     movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-    mov         DWORD [esp+temp2], edx                           ; backup nbits in temp2
+    mov         dword [esp+temp2], edx                           ; backup nbits in temp2
 
     ; Emit the Huffman-coded symbol for the number of bits
     mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
@@ -242,13 +240,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
     EMIT_BITS   eax                           ; EMIT_BITS(code, size)
 
-    mov         ecx, DWORD [esp+temp2]        ; restore nbits
+    mov         ecx, dword [esp+temp2]        ; restore nbits
 
     ; Mask off any extra bits in code
     mov         eax, 1
     shl         eax, cl
     dec         eax
-    and         eax, DWORD [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    and         eax, dword [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
 
     ; Emit that number of bits of the value, if positive,
     ; or the complement of its magnitude, if negative.
@@ -291,22 +289,22 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     jz          near .ELOOP
     lea         esi, [esi+ecx*2]        ; k += r;
     shr         edx, cl                 ; index >>= r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
 .BRLOOP:
     cmp         ecx, 16                       ; while (r > 15) {
     jl          near .ERLOOP
     sub         ecx, 16                       ; r -= 16;
-    mov         DWORD [esp+temp], ecx
+    mov         dword [esp+temp], ecx
     mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
     movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
     EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, DWORD [esp+temp]
+    mov         ecx, dword [esp+temp]
     jmp         .BRLOOP
 .ERLOOP:
     movsx       eax, word [esi]                                  ; temp = t1[k];
     movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
     movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-    mov         DWORD [esp+temp2], eax
+    mov         dword [esp+temp2], eax
     ; Emit Huffman symbol for run length / number of bits
     shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
     add         ecx, eax
@@ -316,13 +314,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
 
     movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
     ; Mask off any extra bits in code
-    mov         ecx, DWORD [esp+temp2]
+    mov         ecx, dword [esp+temp2]
     mov         eax, 1
     shl         eax, cl
     dec         eax
     and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
     EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, DWORD [esp+temp3]
+    mov         edx, dword [esp+temp3]
     add         esi, 2                  ; ++k;
     shr         edx, 1                  ; index >>= 1;
 
@@ -352,29 +350,29 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     shr         edx, cl                 ; index >>= r;
     add         ecx, eax
     lea         esi, [esi+ecx*2]        ; k += r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
     jmp         .BRLOOP2
 .BLOOP2:
     bsf         ecx, edx                ; r = __builtin_ctzl(index);
     jz          near .ELOOP2
     lea         esi, [esi+ecx*2]        ; k += r;
     shr         edx, cl                 ; index >>= r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
 .BRLOOP2:
     cmp         ecx, 16                       ; while (r > 15) {
     jl          near .ERLOOP2
     sub         ecx, 16                       ; r -= 16;
-    mov         DWORD [esp+temp], ecx
+    mov         dword [esp+temp], ecx
     mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
     movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
     EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, DWORD [esp+temp]
+    mov         ecx, dword [esp+temp]
     jmp         .BRLOOP2
 .ERLOOP2:
     movsx       eax, word [esi]         ; temp = t1[k];
     bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
     inc         eax
-    mov         DWORD [esp+temp2], eax
+    mov         dword [esp+temp2], eax
     ; Emit Huffman symbol for run length / number of bits
     shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
     add         ecx, eax
@@ -384,13 +382,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
 
     movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
     ; Mask off any extra bits in code
-    mov         ecx, DWORD [esp+temp2]
+    mov         ecx, dword [esp+temp2]
     mov         eax, 1
     shl         eax, cl
     dec         eax
     and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
     EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, DWORD [esp+temp3]
+    mov         edx, dword [esp+temp3]
     add         esi, 2                  ; ++k;
     shr         edx, 1                  ; index >>= 1;
 
@@ -407,8 +405,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     mov         eax, [esp+buffer]
     pop         esi
     ; Save put_buffer & put_bits
-    mov         DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         DWORD [esi+12], put_bits   ; state->cur.put_bits = put_bits;
+    mov         dword [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
+    mov         dword [esi+12], put_bits   ; state->cur.put_bits = put_bits;
 
     pop         ebp
     pop         edi
diff --git a/simd/i386/jcphuff-sse2.asm b/simd/i386/jcphuff-sse2.asm
index 25c63c7b..8b731783 100644
--- a/simd/i386/jcphuff-sse2.asm
+++ b/simd/i386/jcphuff-sse2.asm
@@ -15,8 +15,6 @@
 ;
 ; This file contains an SSE2 implementation of data preparation for progressive
 ; Huffman encoding.  See jcphuff.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -329,6 +327,8 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
     add         LUT, 16*SIZEOF_INT
     dec         K
     jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
 .ELOOP16:
     mov         LENEND, LEN
     and         LENEND, 7
diff --git a/simd/i386/jcsample-avx2.asm b/simd/i386/jcsample-avx2.asm
index 5bcdefd5..0a20802d 100644
--- a/simd/i386/jcsample-avx2.asm
+++ b/simd/i386/jcsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcsample-mmx.asm b/simd/i386/jcsample-mmx.asm
index faf42345..2c223eeb 100644
--- a/simd/i386/jcsample-mmx.asm
+++ b/simd/i386/jcsample-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcsample-sse2.asm b/simd/i386/jcsample-sse2.asm
index b10fa836..4fea60d2 100644
--- a/simd/i386/jcsample-sse2.asm
+++ b/simd/i386/jcsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdcolext-avx2.asm b/simd/i386/jdcolext-avx2.asm
index 46de9b9d..015be041 100644
--- a/simd/i386/jdcolext-avx2.asm
+++ b/simd/i386/jdcolext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -348,7 +346,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     vmovd       eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -357,7 +355,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     ; space.
     test        ecx, ecx
     jz          short .nextrow
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdcolext-mmx.asm b/simd/i386/jdcolext-mmx.asm
index cd2cb3f9..5813cfcb 100644
--- a/simd/i386/jdcolext-mmx.asm
+++ b/simd/i386/jdcolext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -280,7 +278,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     movd        eax, mmA
     cmp         ecx, byte SIZEOF_DWORD
     jb          short .column_st2
-    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    mov         dword [edi+0*SIZEOF_DWORD], eax
     psrlq       mmA, DWORD_BIT
     movd        eax, mmA
     sub         ecx, byte SIZEOF_DWORD
@@ -288,14 +286,14 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
 .column_st2:
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi+0*SIZEOF_WORD], ax
+    mov         word [edi+0*SIZEOF_WORD], ax
     shr         eax, WORD_BIT
     sub         ecx, byte SIZEOF_WORD
     add         edi, byte SIZEOF_WORD
 .column_st1:
     cmp         ecx, byte SIZEOF_BYTE
     jb          short .nextrow
-    mov         BYTE [edi+0*SIZEOF_BYTE], al
+    mov         byte [edi+0*SIZEOF_BYTE], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -367,7 +365,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
 .column_st4:
     cmp         ecx, byte SIZEOF_MMWORD/8
     jb          short .nextrow
-    movd        DWORD [edi+0*SIZEOF_DWORD], mmA
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/i386/jdcolext-sse2.asm b/simd/i386/jdcolext-sse2.asm
index 0fcb0067..d5572b32 100644
--- a/simd/i386/jdcolext-sse2.asm
+++ b/simd/i386/jdcolext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -320,7 +318,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     movd        eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -329,7 +327,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     ; space.
     test        ecx, ecx
     jz          short .nextrow
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdcolor-avx2.asm b/simd/i386/jdcolor-avx2.asm
index d2f86e65..e05b60d0 100644
--- a/simd/i386/jdcolor-avx2.asm
+++ b/simd/i386/jdcolor-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdcolor-mmx.asm b/simd/i386/jdcolor-mmx.asm
index 8f5a3b3f..fb7e7bcc 100644
--- a/simd/i386/jdcolor-mmx.asm
+++ b/simd/i386/jdcolor-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdcolor-sse2.asm b/simd/i386/jdcolor-sse2.asm
index ae553dba..b7362553 100644
--- a/simd/i386/jdcolor-sse2.asm
+++ b/simd/i386/jdcolor-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmerge-avx2.asm b/simd/i386/jdmerge-avx2.asm
index 17318440..711e6792 100644
--- a/simd/i386/jdmerge-avx2.asm
+++ b/simd/i386/jdmerge-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmerge-mmx.asm b/simd/i386/jdmerge-mmx.asm
index 607bf39f..6e8311d4 100644
--- a/simd/i386/jdmerge-mmx.asm
+++ b/simd/i386/jdmerge-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmerge-sse2.asm b/simd/i386/jdmerge-sse2.asm
index ddb1d5e7..e32f90aa 100644
--- a/simd/i386/jdmerge-sse2.asm
+++ b/simd/i386/jdmerge-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmrgext-avx2.asm b/simd/i386/jdmrgext-avx2.asm
index cde48653..e35f7282 100644
--- a/simd/i386/jdmrgext-avx2.asm
+++ b/simd/i386/jdmrgext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -354,7 +352,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     vmovd       eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -363,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     ; space.
     test        ecx, ecx
     jz          short .endcolumn
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdmrgext-mmx.asm b/simd/i386/jdmrgext-mmx.asm
index 4b9e35d8..eb3e36b4 100644
--- a/simd/i386/jdmrgext-mmx.asm
+++ b/simd/i386/jdmrgext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -283,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
     movd        eax, mmA
     cmp         ecx, byte SIZEOF_DWORD
     jb          short .column_st2
-    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    mov         dword [edi+0*SIZEOF_DWORD], eax
     psrlq       mmA, DWORD_BIT
     movd        eax, mmA
     sub         ecx, byte SIZEOF_DWORD
@@ -291,14 +289,14 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
 .column_st2:
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi+0*SIZEOF_WORD], ax
+    mov         word [edi+0*SIZEOF_WORD], ax
     shr         eax, WORD_BIT
     sub         ecx, byte SIZEOF_WORD
     add         edi, byte SIZEOF_WORD
 .column_st1:
     cmp         ecx, byte SIZEOF_BYTE
     jb          short .endcolumn
-    mov         BYTE [edi+0*SIZEOF_BYTE], al
+    mov         byte [edi+0*SIZEOF_BYTE], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -373,7 +371,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
 .column_st4:
     cmp         ecx, byte SIZEOF_MMWORD/8
     jb          short .endcolumn
-    movd        DWORD [edi+0*SIZEOF_DWORD], mmA
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/i386/jdmrgext-sse2.asm b/simd/i386/jdmrgext-sse2.asm
index ac4697ef..c113dc4d 100644
--- a/simd/i386/jdmrgext-sse2.asm
+++ b/simd/i386/jdmrgext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -325,7 +323,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     movd        eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -334,7 +332,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     ; space.
     test        ecx, ecx
     jz          short .endcolumn
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdsample-avx2.asm b/simd/i386/jdsample-avx2.asm
index 61ce511c..a800c35e 100644
--- a/simd/i386/jdsample-avx2.asm
+++ b/simd/i386/jdsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdsample-mmx.asm b/simd/i386/jdsample-mmx.asm
index 1f810fab..12c49f0e 100644
--- a/simd/i386/jdsample-mmx.asm
+++ b/simd/i386/jdsample-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdsample-sse2.asm b/simd/i386/jdsample-sse2.asm
index f0da6269..4e28d2f4 100644
--- a/simd/i386/jdsample-sse2.asm
+++ b/simd/i386/jdsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jfdctflt-3dn.asm b/simd/i386/jfdctflt-3dn.asm
index 1d45865b..322ab163 100644
--- a/simd/i386/jfdctflt-3dn.asm
+++ b/simd/i386/jfdctflt-3dn.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctflt-sse.asm b/simd/i386/jfdctflt-sse.asm
index 1faf8352..86952c64 100644
--- a/simd/i386/jfdctflt-sse.asm
+++ b/simd/i386/jfdctflt-sse.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctfst-mmx.asm b/simd/i386/jfdctfst-mmx.asm
index 0271901c..80645a50 100644
--- a/simd/i386/jfdctfst-mmx.asm
+++ b/simd/i386/jfdctfst-mmx.asm
@@ -18,8 +18,6 @@
 ; the forward DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctfst-sse2.asm b/simd/i386/jfdctfst-sse2.asm
index f09dadd9..446fa7a6 100644
--- a/simd/i386/jfdctfst-sse2.asm
+++ b/simd/i386/jfdctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the forward DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm
index ae258eee..97de2302 100644
--- a/simd/i386/jfdctint-avx2.asm
+++ b/simd/i386/jfdctint-avx2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm
index c6bd9597..3ade9d49 100644
--- a/simd/i386/jfdctint-mmx.asm
+++ b/simd/i386/jfdctint-mmx.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm
index d67dcc1b..71b684c4 100644
--- a/simd/i386/jfdctint-sse2.asm
+++ b/simd/i386/jfdctint-sse2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jidctflt-3dn.asm b/simd/i386/jidctflt-3dn.asm
index 73aa18d0..87951910 100644
--- a/simd/i386/jidctflt-3dn.asm
+++ b/simd/i386/jidctflt-3dn.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -92,23 +90,23 @@ EXTN(jsimd_idct_float_3dnow):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     pushpic     ebx                     ; save GOT address
-    mov         ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-    mov         eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-    or          ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-    or          ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
     or          eax, ebx
     poppic      ebx                     ; restore GOT address
     jnz         short .columnDCT
 
     ; -- AC terms all zero
 
-    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm0, mm0
     psrad       mm0, (DWORD_BIT-WORD_BIT)
@@ -135,10 +133,10 @@ EXTN(jsimd_idct_float_3dnow):
 
     ; -- Even part
 
-    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-    movd        mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-    movd        mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-    movd        mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm0, mm0
     punpcklwd   mm1, mm1
@@ -182,10 +180,10 @@ EXTN(jsimd_idct_float_3dnow):
 
     ; -- Odd part
 
-    movd        mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    movd        mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-    movd        mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-    movd        mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm2, mm2
     punpcklwd   mm3, mm3
diff --git a/simd/i386/jidctflt-sse.asm b/simd/i386/jidctflt-sse.asm
index 386650f0..b27ecfdf 100644
--- a/simd/i386/jidctflt-sse.asm
+++ b/simd/i386/jidctflt-sse.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctflt-sse2.asm b/simd/i386/jidctflt-sse2.asm
index 9de71392..c646eaef 100644
--- a/simd/i386/jidctflt-sse2.asm
+++ b/simd/i386/jidctflt-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse2):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-mmx.asm b/simd/i386/jidctfst-mmx.asm
index d3e8a5d6..24622d43 100644
--- a/simd/i386/jidctfst-mmx.asm
+++ b/simd/i386/jidctfst-mmx.asm
@@ -18,8 +18,6 @@
 ; the inverse DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -123,8 +121,8 @@ EXTN(jsimd_idct_ifast_mmx):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-sse2.asm b/simd/i386/jidctfst-sse2.asm
index 83bc414a..19704ffa 100644
--- a/simd/i386/jidctfst-sse2.asm
+++ b/simd/i386/jidctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the inverse DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -118,8 +116,8 @@ EXTN(jsimd_idct_ifast_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm
index b3b7b14f..c371985c 100644
--- a/simd/i386/jidctint-avx2.asm
+++ b/simd/i386/jidctint-avx2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -320,8 +318,8 @@ EXTN(jsimd_idct_islow_avx2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm
index 6ca6d060..4f07f567 100644
--- a/simd/i386/jidctint-mmx.asm
+++ b/simd/i386/jidctint-mmx.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -136,8 +134,8 @@ EXTN(jsimd_idct_islow_mmx):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm
index a6bd00a9..e442fdd2 100644
--- a/simd/i386/jidctint-sse2.asm
+++ b/simd/i386/jidctint-sse2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -131,8 +129,8 @@ EXTN(jsimd_idct_islow_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctred-mmx.asm b/simd/i386/jidctred-mmx.asm
index 336ee3b9..e2307e1c 100644
--- a/simd/i386/jidctred-mmx.asm
+++ b/simd/i386/jidctred-mmx.asm
@@ -18,8 +18,6 @@
 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 ; The following code is based directly on the IJG's original jidctred.c;
 ; see the jidctred.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -144,8 +142,8 @@ EXTN(jsimd_idct_4x4_mmx):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -464,16 +462,16 @@ EXTN(jsimd_idct_4x4_mmx):
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
 
     psrlq       mm1, 4*BYTE_BIT
     psrlq       mm0, 4*BYTE_BIT
 
     mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
 
     emms                                ; empty MMX state
 
@@ -688,8 +686,8 @@ EXTN(jsimd_idct_2x2_mmx):
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
-    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
 
     emms                                ; empty MMX state
 
diff --git a/simd/i386/jidctred-sse2.asm b/simd/i386/jidctred-sse2.asm
index 97838baf..6e56494e 100644
--- a/simd/i386/jidctred-sse2.asm
+++ b/simd/i386/jidctred-sse2.asm
@@ -18,8 +18,6 @@
 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 ; The following code is based directly on the IJG's original jidctred.c;
 ; see the jidctred.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -139,8 +137,8 @@ EXTN(jsimd_idct_4x4_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -578,8 +576,8 @@ EXTN(jsimd_idct_2x2_sse2):
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
-    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
 
     pop         edi
     pop         esi
diff --git a/simd/i386/jquant-3dn.asm b/simd/i386/jquant-3dn.asm
index 1767f444..5cb60caa 100644
--- a/simd/i386/jquant-3dn.asm
+++ b/simd/i386/jquant-3dn.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquant-mmx.asm b/simd/i386/jquant-mmx.asm
index 98932db6..61305c62 100644
--- a/simd/i386/jquant-mmx.asm
+++ b/simd/i386/jquant-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquant-sse.asm b/simd/i386/jquant-sse.asm
index cc244c4b..218adc97 100644
--- a/simd/i386/jquant-sse.asm
+++ b/simd/i386/jquant-sse.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquantf-sse2.asm b/simd/i386/jquantf-sse2.asm
index 8d1201c0..a881ab50 100644
--- a/simd/i386/jquantf-sse2.asm
+++ b/simd/i386/jquantf-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm
index ea8e1a1e..5ed6bec2 100644
--- a/simd/i386/jquanti-avx2.asm
+++ b/simd/i386/jquanti-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquanti-sse2.asm b/simd/i386/jquanti-sse2.asm
index 2a69494b..0a509408 100644
--- a/simd/i386/jquanti-sse2.asm
+++ b/simd/i386/jquanti-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jsimdcpu.asm b/simd/i386/jsimdcpu.asm
index 50a0d516..ddcafa9e 100644
--- a/simd/i386/jsimdcpu.asm
+++ b/simd/i386/jsimdcpu.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -51,29 +49,14 @@ EXTN(jpeg_simd_cpu_support):
     xor         eax, edx
     jz          near .return            ; CPUID is not supported
 
-    ; Check for MMX instruction support
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
     xor         eax, eax
     cpuid
     test        eax, eax
     jz          near .return
-
-    xor         eax, eax
-    inc         eax
-    cpuid
-    mov         eax, edx                ; eax = Standard feature flags
-
-    test        eax, 1<<23              ; bit23:MMX
-    jz          short .no_mmx
-    or          edi, byte JSIMD_MMX
-.no_mmx:
-    test        eax, 1<<25              ; bit25:SSE
-    jz          short .no_sse
-    or          edi, byte JSIMD_SSE
-.no_sse:
-    test        eax, 1<<26              ; bit26:SSE2
-    jz          short .no_sse2
-    or          edi, byte JSIMD_SSE2
-.no_sse2:
+    cmp         eax, 7                  ; eax = maximum basic CPUID leaf
+    jl          short .no_avx2          ; Maximum leaf < 07H
 
     ; Check for AVX2 instruction support
     mov         eax, 7
@@ -94,13 +77,34 @@ EXTN(jpeg_simd_cpu_support):
 
     xor         ecx, ecx
     xgetbv
-    test        eax, 6                  ; O/S does not manage XMM/YMM state
+    and         eax, 6                  ; keep XCR0 bits 1 (XMM), 2 (YMM)
+    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
                                         ; using XSAVE
-    jz          short .no_avx2
+    jnz         short .no_avx2
 
     or          edi, JSIMD_AVX2
 .no_avx2:
 
+    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+    xor         eax, eax
+    inc         eax
+    cpuid
+    mov         eax, edx                ; eax = Standard feature flags
+
+    ; Check for MMX instruction support
+    test        eax, 1<<23              ; bit23:MMX
+    jz          short .no_mmx
+    or          edi, byte JSIMD_MMX
+.no_mmx:
+    test        eax, 1<<25              ; bit25:SSE
+    jz          short .no_sse
+    or          edi, byte JSIMD_SSE
+.no_sse:
+    test        eax, 1<<26              ; bit26:SSE2
+    jz          short .no_sse2
+    or          edi, byte JSIMD_SSE2
+.no_sse2:
+
     ; Check for 3DNow! instruction support
     mov         eax, 0x80000000
     cpuid
diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c
index e1c4e69e..6cdeb5e0 100644
--- a/simd/loongson/jccolext-mmi.c
+++ b/simd/loongson/jccolext-mmi.c
@@ -2,12 +2,13 @@
  * Loongson MMI optimizations for libjpeg-turbo
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
  * Authors:  ZhuChen     <zhuchen@loongson.cn>
  *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
  *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
  *
  * Based on the x86 SIMD extension for IJG JPEG library
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -184,9 +185,15 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
               "$14", "memory"
            );
       } else {
-        mmA = _mm_load_si64((__m64 *)&inptr[0]);
-        mmG = _mm_load_si64((__m64 *)&inptr[8]);
-        mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        if (!(((long)inptr) & 7)) {  /* inptr is 8-byte aligned */
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {  /* inptr is not 8-byte aligned */
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
         inptr += RGB_PIXELSIZE * 8;
       }
       mmD = mmA;
@@ -268,10 +275,17 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
             : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
            );
       } else {
-        mmA = _mm_load_si64((__m64 *)&inptr[0]);
-        mmF = _mm_load_si64((__m64 *)&inptr[8]);
-        mmD = _mm_load_si64((__m64 *)&inptr[16]);
-        mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        if (!(((long)inptr) & 7)) {  /* inptr is 8-byte aligned */
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {  /* inptr is not 8-byte aligned */
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
         inptr += RGB_PIXELSIZE * 8;
       }
       mmB = mmA;
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/loongson/loongson-mmintrin.h
index 4aea7636..50d166b7 100644
--- a/simd/loongson/loongson-mmintrin.h
+++ b/simd/loongson/loongson-mmintrin.h
@@ -1,8 +1,9 @@
 /*
  * Loongson MMI optimizations for libjpeg-turbo
  *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -41,7 +42,7 @@ typedef float __m32;
 
 /********** Set Operations **********/
 
-extern __inline __m64
+extern __inline __m64 FUNCTION_ATTRIBS
 _mm_setzero_si64(void)
 {
   return 0.0;
@@ -1245,6 +1246,22 @@ _mm_load_si64(const __m64 *src)
   asm("ldc1 %0, %1\n\t"
       : "=f" (ret)
       : "m" (*src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)  /* 64-bit load; src may be unaligned */
+{
+  __m64 ret;
+
+  asm("gsldlc1 %0,  7(%1)\n\t"
+      "gsldrc1 %0,  0(%1)\n\t"
+      : "=f" (ret)
+      : "r" (src)
+      : "memory"
      );
 
   return ret;
diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c
index af886f60..454cc99c 100644
--- a/simd/mips/jsimd.c
+++ b/simd/mips/jsimd.c
@@ -692,8 +692,10 @@ jsimd_can_convsamp_float(void)
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
+#ifndef __mips_soft_float
   if (simd_support & JSIMD_DSPR2)
     return 1;
+#endif
 
   return 0;
 }
@@ -709,7 +711,9 @@ GLOBAL(void)
 jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
                      FAST_FLOAT *workspace)
 {
+#ifndef __mips_soft_float
   jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+#endif
 }
 
 GLOBAL(int)
@@ -805,8 +809,10 @@ jsimd_can_quantize_float(void)
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
+#ifndef __mips_soft_float
   if (simd_support & JSIMD_DSPR2)
     return 1;
+#endif
 
   return 0;
 }
@@ -821,7 +827,9 @@ GLOBAL(void)
 jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
                      FAST_FLOAT *workspace)
 {
+#ifndef __mips_soft_float
   jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
 }
 
 GLOBAL(int)
diff --git a/simd/mips/jsimd_dspr2.S b/simd/mips/jsimd_dspr2.S
index 2ec543e2..a28c1161 100644
--- a/simd/mips/jsimd_dspr2.S
+++ b/simd/mips/jsimd_dspr2.S
@@ -2810,6 +2810,8 @@ LEAF_DSPR2(jsimd_quantize_dspr2)
 END(jsimd_quantize_dspr2)
 
 
+#ifndef __mips_soft_float
+
 /*****************************************************************************/
 LEAF_DSPR2(jsimd_quantize_float_dspr2)
 /*
@@ -2890,6 +2892,8 @@ LEAF_DSPR2(jsimd_quantize_float_dspr2)
 
 END(jsimd_quantize_float_dspr2)
 
+#endif
+
 
 /*****************************************************************************/
 LEAF_DSPR2(jsimd_idct_2x2_dspr2)
@@ -4110,6 +4114,8 @@ LEAF_DSPR2(jsimd_convsamp_dspr2)
 END(jsimd_convsamp_dspr2)
 
 
+#ifndef __mips_soft_float
+
 /*****************************************************************************/
 LEAF_DSPR2(jsimd_convsamp_float_dspr2)
 /*
@@ -4468,4 +4474,6 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2)
 
 END(jsimd_convsamp_float_dspr2)
 
+#endif
+
 /*****************************************************************************/
diff --git a/simd/nasm/jcolsamp.inc b/simd/nasm/jcolsamp.inc
index a2d5b494..6f6d7f29 100644
--- a/simd/nasm/jcolsamp.inc
+++ b/simd/nasm/jcolsamp.inc
@@ -7,8 +7,6 @@
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
 
 ; --------------------------------------------------------------------------
 
diff --git a/simd/nasm/jdct.inc b/simd/nasm/jdct.inc
index 79d51460..9192f66f 100644
--- a/simd/nasm/jdct.inc
+++ b/simd/nasm/jdct.inc
@@ -7,8 +7,6 @@
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
 
 ; Each IDCT routine is responsible for range-limiting its results and
 ; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
index b40901f0..9930d80c 100644
--- a/simd/nasm/jsimdext.inc
+++ b/simd/nasm/jsimdext.inc
@@ -2,7 +2,7 @@
 ; jsimdext.inc - common declarations
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, 2016, D. R. Commander.
+; Copyright (C) 2010, 2016, 2019, D. R. Commander.
 ; Copyright (C) 2018, Matthieu Darbois.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
@@ -24,8 +24,6 @@
 ; 2. Altered source versions must be plainly marked as such, and must not be
 ;    misrepresented as being the original software.
 ; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
 
 ; ==========================================================================
 ;  System-dependent configurations
@@ -167,19 +165,19 @@ section .note.GNU-stack noalloc noexec nowrite progbits
 %define XMM_DWORD
 %define XMM_MMWORD
 
-%define SIZEOF_BYTE   1                 ; sizeof(BYTE)
-%define SIZEOF_WORD   2                 ; sizeof(WORD)
-%define SIZEOF_DWORD  4                 ; sizeof(DWORD)
-%define SIZEOF_QWORD  8                 ; sizeof(QWORD)
-%define SIZEOF_OWORD  16                ; sizeof(OWORD)
-%define SIZEOF_YWORD  32                ; sizeof(YWORD)
+%define SIZEOF_BYTE   1                 ; sizeof(byte)
+%define SIZEOF_WORD   2                 ; sizeof(word)
+%define SIZEOF_DWORD  4                 ; sizeof(dword)
+%define SIZEOF_QWORD  8                 ; sizeof(qword)
+%define SIZEOF_OWORD  16                ; sizeof(oword)
+%define SIZEOF_YWORD  32                ; sizeof(yword)
 
 %define BYTE_BIT      8                 ; CHAR_BIT in C
-%define WORD_BIT      16                ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT     32                ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT     64                ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT     128               ; sizeof(OWORD)*BYTE_BIT
-%define YWORD_BIT     256               ; sizeof(YWORD)*BYTE_BIT
+%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
 
 ; --------------------------------------------------------------------------
 ;  External Symbol Name
@@ -198,6 +196,11 @@ section .note.GNU-stack noalloc noexec nowrite progbits
 %ifdef __YASM_VER__
 %define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
 %define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000  ; NASM 2.14 or later
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%endif
 %endif
 %endif
 
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
index 5fa3848c..10d28348 100644
--- a/simd/x86_64/jccolext-avx2.asm
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -96,12 +94,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
index b1486c0b..2c914d31 100644
--- a/simd/x86_64/jccolext-sse2.asm
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -95,12 +93,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jccolor-avx2.asm b/simd/x86_64/jccolor-avx2.asm
index f9f4be06..16b78298 100644
--- a/simd/x86_64/jccolor-avx2.asm
+++ b/simd/x86_64/jccolor-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jccolor-sse2.asm b/simd/x86_64/jccolor-sse2.asm
index 3e46601d..e2955c21 100644
--- a/simd/x86_64/jccolor-sse2.asm
+++ b/simd/x86_64/jccolor-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcgray-avx2.asm b/simd/x86_64/jcgray-avx2.asm
index 0ec24104..591255bb 100644
--- a/simd/x86_64/jcgray-avx2.asm
+++ b/simd/x86_64/jcgray-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcgray-sse2.asm b/simd/x86_64/jcgray-sse2.asm
index edf9222e..e389904f 100644
--- a/simd/x86_64/jcgray-sse2.asm
+++ b/simd/x86_64/jcgray-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
index 79e2aa01..175b60de 100644
--- a/simd/x86_64/jcgryext-avx2.asm
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -88,12 +86,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
index 9c3ae5ef..873be805 100644
--- a/simd/x86_64/jcgryext-sse2.asm
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -87,12 +85,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index 1b091ad1..aa78fd5c 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains an SSE2 implementation for Huffman coding of one block.
 ; The following code is based directly on jchuff.c; see jchuff.c for more
 ; details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -200,7 +198,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     mov         buffer, r11                  ; r11 is now sratch
 
     mov         put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
+    mov         put_bits,    dword [r10+24]  ; put_bits = state->cur.put_bits;
     push        r10                          ; r10 is now scratch
 
     ; Encode the DC coefficient difference per section F.1.2.1
@@ -333,7 +331,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     pop         r10
     ; Save put_buffer & put_bits
     mov         MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         DWORD  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
+    mov         dword  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
 
     pop         rbx
     uncollect_args 6
diff --git a/simd/x86_64/jcphuff-sse2.asm b/simd/x86_64/jcphuff-sse2.asm
index b17488a7..8ed44728 100644
--- a/simd/x86_64/jcphuff-sse2.asm
+++ b/simd/x86_64/jcphuff-sse2.asm
@@ -16,8 +16,6 @@
 ;
 ; This file contains an SSE2 implementation of data preparation for progressive
 ; Huffman encoding.  See jcphuff.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -322,6 +320,8 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
     add         LUT, 16*SIZEOF_INT
     dec         K
     jnz         .BLOOP16
+    test        LEN, 15                 ; low 4 bits of LEN all zero?
+    je          .PADDING                ; yes: skip the LEN & 8 / .TRY7 steps
 .ELOOP16:
     test        LEN, 8
     jz          .TRY7
diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm
index 9d5a8618..d9922bb4 100644
--- a/simd/x86_64/jcsample-avx2.asm
+++ b/simd/x86_64/jcsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm
index 1b315364..0f107e9a 100644
--- a/simd/x86_64/jcsample-sse2.asm
+++ b/simd/x86_64/jcsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
index e2b96c73..677b8ed8 100644
--- a/simd/x86_64/jdcolext-avx2.asm
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -334,7 +332,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     vmovd       eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -343,7 +341,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     ; space.
     test        rcx, rcx
     jz          short .nextrow
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
index a94954bb..071aa629 100644
--- a/simd/x86_64/jdcolext-sse2.asm
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -306,7 +304,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     movd        eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -315,7 +313,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     ; space.
     test        rcx, rcx
     jz          short .nextrow
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdcolor-avx2.asm b/simd/x86_64/jdcolor-avx2.asm
index abad1766..43de9db0 100644
--- a/simd/x86_64/jdcolor-avx2.asm
+++ b/simd/x86_64/jdcolor-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdcolor-sse2.asm b/simd/x86_64/jdcolor-sse2.asm
index e7079f60..b3f1fec0 100644
--- a/simd/x86_64/jdcolor-sse2.asm
+++ b/simd/x86_64/jdcolor-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdmerge-avx2.asm b/simd/x86_64/jdmerge-avx2.asm
index ca3f063c..9515a170 100644
--- a/simd/x86_64/jdmerge-avx2.asm
+++ b/simd/x86_64/jdmerge-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdmerge-sse2.asm b/simd/x86_64/jdmerge-sse2.asm
index f3e09fa8..aedccc20 100644
--- a/simd/x86_64/jdmerge-sse2.asm
+++ b/simd/x86_64/jdmerge-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
index 04e8a945..bb733c58 100644
--- a/simd/x86_64/jdmrgext-avx2.asm
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -339,7 +337,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     vmovd       eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -348,7 +346,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     ; space.
     test        rcx, rcx
     jz          short .endcolumn
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
index 1cc33455..b176a4cd 100644
--- a/simd/x86_64/jdmrgext-sse2.asm
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -310,7 +308,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     movd        eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -319,7 +317,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     ; space.
     test        rcx, rcx
     jz          short .endcolumn
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm
index 10fa5c45..fc274a95 100644
--- a/simd/x86_64/jdsample-avx2.asm
+++ b/simd/x86_64/jdsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm
index d8ccda9b..20e07670 100644
--- a/simd/x86_64/jdsample-sse2.asm
+++ b/simd/x86_64/jdsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jfdctflt-sse.asm b/simd/x86_64/jfdctflt-sse.asm
index 26f9fb6a..ef279664 100644
--- a/simd/x86_64/jfdctflt-sse.asm
+++ b/simd/x86_64/jfdctflt-sse.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jfdctfst-sse2.asm b/simd/x86_64/jfdctfst-sse2.asm
index aaf8b9e3..2e1bfe6e 100644
--- a/simd/x86_64/jfdctfst-sse2.asm
+++ b/simd/x86_64/jfdctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the forward DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm
index 448f47d4..6ad4cf0b 100644
--- a/simd/x86_64/jfdctint-avx2.asm
+++ b/simd/x86_64/jfdctint-avx2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm
index ef16a52a..5d0de3cf 100644
--- a/simd/x86_64/jfdctint-sse2.asm
+++ b/simd/x86_64/jfdctint-sse2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
index b676ef39..ab95e1a6 100644
--- a/simd/x86_64/jidctflt-sse2.asm
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -95,8 +93,8 @@ EXTN(jsimd_idct_float_sse2):
     mov         rcx, DCTSIZE/4          ; ctr
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
index c6c42f9b..a66a6811 100644
--- a/simd/x86_64/jidctfst-sse2.asm
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the inverse DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -111,8 +109,8 @@ EXTN(jsimd_idct_ifast_sse2):
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
index b60b44f2..50270f47 100644
--- a/simd/x86_64/jidctint-avx2.asm
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -292,8 +290,8 @@ EXTN(jsimd_idct_islow_avx2):
     ; ---- Pass 1: process columns.
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
-    mov         eax, DWORD [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
index 83fc344b..034530c2 100644
--- a/simd/x86_64/jidctint-sse2.asm
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -124,8 +122,8 @@ EXTN(jsimd_idct_islow_sse2):
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
index af64fdc2..7fbfcc51 100644
--- a/simd/x86_64/jidctred-sse2.asm
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -18,8 +18,6 @@
 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 ; The following code is based directly on the IJG's original jidctred.c;
 ; see the jidctred.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -132,8 +130,8 @@ EXTN(jsimd_idct_4x4_sse2):
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
@@ -562,8 +560,8 @@ EXTN(jsimd_idct_2x2_sse2):
 
     mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
     mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-    mov         WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
+    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
 
     pop         rbx
     uncollect_args 4
diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm
index 4600eecc..83596a91 100644
--- a/simd/x86_64/jquantf-sse2.asm
+++ b/simd/x86_64/jquantf-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm
index b7243e43..5f04d223 100644
--- a/simd/x86_64/jquanti-avx2.asm
+++ b/simd/x86_64/jquanti-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm
index 7ff7275f..bb6fa69e 100644
--- a/simd/x86_64/jquanti-sse2.asm
+++ b/simd/x86_64/jquanti-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jsimdcpu.asm b/simd/x86_64/jsimdcpu.asm
index 42979bef..705f813d 100644
--- a/simd/x86_64/jsimdcpu.asm
+++ b/simd/x86_64/jsimdcpu.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -38,14 +36,23 @@ EXTN(jpeg_simd_cpu_support):
 
     xor         rdi, rdi                ; simd support flag
 
+    ; Assume that all x86-64 processors support SSE & SSE2 instructions
+    or          rdi, JSIMD_SSE2
+    or          rdi, JSIMD_SSE
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    mov         rax, 0
+    cpuid
+    cmp         rax, 7
+    jl          short .return           ; Maximum leaf < 07H
+
     ; Check for AVX2 instruction support
     mov         rax, 7
     xor         rcx, rcx
     cpuid
     mov         rax, rbx                ; rax = Extended feature flags
 
-    or          rdi, JSIMD_SSE2
-    or          rdi, JSIMD_SSE
     test        rax, 1<<5               ; bit5:AVX2
     jz          short .return
 
@@ -60,9 +67,10 @@ EXTN(jpeg_simd_cpu_support):
 
     xor         rcx, rcx
     xgetbv
-    test        rax, 6                  ; O/S does not manage XMM/YMM state
+    and         rax, 6
+    cmp         rax, 6                  ; O/S does not manage XMM/YMM state
                                         ; using XSAVE
-    jz          short .return
+    jnz         short .return
 
     or          rdi, JSIMD_AVX2
 
diff --git a/testimages/test.scan b/testimages/test.scan
new file mode 100644
index 00000000..563446da
--- /dev/null
+++ b/testimages/test.scan
@@ -0,0 +1,5 @@
+0 1 2: 0 0 0 0;
+0: 1 16 0 0;
+0: 17 63 0 0;
+1: 1 63 0 0;
+2: 1 63 0 0;
diff --git a/tjbench.c b/tjbench.c
index 9ab0ea2a..faad9784 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -32,27 +32,28 @@
 #include <ctype.h>
 #include <math.h>
 #include <errno.h>
+#include <limits.h>
 #include <cdjpeg.h>
 #include "./tjutil.h"
 #include "./turbojpeg.h"
 
 
-#define _throw(op, err) { \
+#define THROW(op, err) { \
   printf("ERROR in line %d while %s:\n%s\n", __LINE__, op, err); \
   retval = -1;  goto bailout; \
 }
-#define _throwunix(m)  _throw(m, strerror(errno))
+#define THROW_UNIX(m)  THROW(m, strerror(errno))
 
 char tjErrorStr[JMSG_LENGTH_MAX] = "\0", tjErrorMsg[JMSG_LENGTH_MAX] = "\0";
 int tjErrorLine = -1, tjErrorCode = -1;
 
-#define _throwtjg(m) { \
+#define THROW_TJG(m) { \
   printf("ERROR in line %d while %s:\n%s\n", __LINE__, m, \
          tjGetErrorStr2(NULL)); \
   retval = -1;  goto bailout; \
 }
 
-#define _throwtj(m) { \
+#define THROW_TJ(m) { \
   int _tjErrorCode = tjGetErrorCode(handle); \
   char *_tjErrorStr = tjGetErrorStr2(handle); \
   \
@@ -60,8 +61,8 @@ int tjErrorLine = -1, tjErrorCode = -1;
     if (strncmp(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX) || \
         strncmp(tjErrorMsg, m, JMSG_LENGTH_MAX) || \
         tjErrorCode != _tjErrorCode || tjErrorLine != __LINE__) { \
-      strncpy(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX); \
-      strncpy(tjErrorMsg, m, JMSG_LENGTH_MAX); \
+      strncpy(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX - 1); \
+      strncpy(tjErrorMsg, m, JMSG_LENGTH_MAX - 1); \
       tjErrorCode = _tjErrorCode; \
       tjErrorLine = __LINE__; \
       printf("WARNING in line %d while %s:\n%s\n", __LINE__, m, _tjErrorStr); \
@@ -95,7 +96,7 @@ int (*customFilter) (short *, tjregion, tjregion, int, int, tjtransform *);
 double benchTime = 5.0, warmup = 1.0;
 
 
-char *formatName(int subsamp, int cs, char *buf)
+static char *formatName(int subsamp, int cs, char *buf)
 {
   if (cs == TJCS_YCbCr)
     return (char *)subNameLong[subsamp];
@@ -107,7 +108,7 @@ char *formatName(int subsamp, int cs, char *buf)
 }
 
 
-char *sigfig(double val, int figs, char *buf, int len)
+static char *sigfig(double val, int figs, char *buf, int len)
 {
   char format[80];
   int digitsAfterDecimal = figs - (int)ceil(log10(fabs(val)));
@@ -122,9 +123,9 @@ char *sigfig(double val, int figs, char *buf, int len)
 
 
 /* Custom DCT filter which produces a negative of the image */
-int dummyDCTFilter(short *coeffs, tjregion arrayRegion, tjregion planeRegion,
-                   int componentIndex, int transformIndex,
-                   tjtransform *transform)
+static int dummyDCTFilter(short *coeffs, tjregion arrayRegion,
+                          tjregion planeRegion, int componentIndex,
+                          int transformIndex, tjtransform *transform)
 {
   int i;
 
@@ -135,11 +136,12 @@ int dummyDCTFilter(short *coeffs, tjregion arrayRegion, tjregion planeRegion,
 
 
 /* Decompression test */
-int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
-           unsigned long *jpegSize, unsigned char *dstBuf, int w, int h,
-           int subsamp, int jpegQual, char *fileName, int tilew, int tileh)
+static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
+                  unsigned long *jpegSize, unsigned char *dstBuf, int w, int h,
+                  int subsamp, int jpegQual, char *fileName, int tilew,
+                  int tileh)
 {
-  char tempStr[1024], sizeStr[20] = "\0", qualStr[6] = "\0", *ptr;
+  char tempStr[1024], sizeStr[24] = "\0", qualStr[13] = "\0", *ptr;
   FILE *file = NULL;
   tjhandle handle = NULL;
   int row, col, iter = 0, dstBufAlloc = 0, retval = 0;
@@ -152,29 +154,34 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
   unsigned char *dstPtr, *dstPtr2, *yuvBuf = NULL;
 
   if (jpegQual > 0) {
-    snprintf(qualStr, 6, "_Q%d", jpegQual);
-    qualStr[5] = 0;
+    snprintf(qualStr, 13, "_Q%d", jpegQual);
+    qualStr[12] = 0;
   }
 
   if ((handle = tjInitDecompress()) == NULL)
-    _throwtj("executing tjInitDecompress()");
+    THROW_TJ("executing tjInitDecompress()");
 
   if (dstBuf == NULL) {
-    if ((dstBuf = (unsigned char *)malloc(pitch * scaledh)) == NULL)
-      _throwunix("allocating destination buffer");
+    if ((unsigned long long)pitch * (unsigned long long)scaledh >
+        (unsigned long long)((size_t)-1))
+      THROW("allocating destination buffer", "Image is too large");
+    if ((dstBuf = (unsigned char *)malloc((size_t)pitch * scaledh)) == NULL)
+      THROW_UNIX("allocating destination buffer");
     dstBufAlloc = 1;
   }
   /* Set the destination buffer to gray so we know whether the decompressor
      attempted to write to it */
-  memset(dstBuf, 127, pitch * scaledh);
+  memset(dstBuf, 127, (size_t)pitch * scaledh);
 
   if (doYUV) {
     int width = doTile ? tilew : scaledw;
     int height = doTile ? tileh : scaledh;
-    int yuvSize = tjBufSizeYUV2(width, yuvPad, height, subsamp);
+    unsigned long yuvSize = tjBufSizeYUV2(width, yuvPad, height, subsamp);
 
+    if (yuvSize == (unsigned long)-1)
+      THROW_TJ("allocating YUV buffer");
     if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-      _throwunix("allocating YUV buffer");
+      THROW_UNIX("allocating YUV buffer");
     memset(yuvBuf, 127, yuvSize);
   }
 
@@ -186,7 +193,7 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
     double start = getTime();
 
     for (row = 0, dstPtr = dstBuf; row < ntilesh;
-         row++, dstPtr += pitch * tileh) {
+         row++, dstPtr += (size_t)pitch * tileh) {
       for (col = 0, dstPtr2 = dstPtr; col < ntilesw;
            col++, tile++, dstPtr2 += ps * tilew) {
         int width = doTile ? min(tilew, w - col * tilew) : scaledw;
@@ -197,16 +204,16 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
 
           if (tjDecompressToYUV2(handle, jpegBuf[tile], jpegSize[tile], yuvBuf,
                                  width, yuvPad, height, flags) == -1)
-            _throwtj("executing tjDecompressToYUV2()");
+            THROW_TJ("executing tjDecompressToYUV2()");
           startDecode = getTime();
           if (tjDecodeYUV(handle, yuvBuf, yuvPad, subsamp, dstPtr2, width,
                           pitch, height, pf, flags) == -1)
-            _throwtj("executing tjDecodeYUV()");
+            THROW_TJ("executing tjDecodeYUV()");
           if (iter >= 0) elapsedDecode += getTime() - startDecode;
         } else if (tjDecompress2(handle, jpegBuf[tile], jpegSize[tile],
                                  dstPtr2, width, pitch, height, pf,
                                  flags) == -1)
-          _throwtj("executing tjDecompress2()");
+          THROW_TJ("executing tjDecompress2()");
       }
     }
     elapsed += getTime() - start;
@@ -220,7 +227,7 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
   }
   if (doYUV) elapsed -= elapsedDecode;
 
-  if (tjDestroy(handle) == -1) _throwtj("executing tjDestroy()");
+  if (tjDestroy(handle) == -1) THROW_TJ("executing tjDestroy()");
   handle = NULL;
 
   if (quiet) {
@@ -249,10 +256,10 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
   if (!doWrite) goto bailout;
 
   if (sf.num != 1 || sf.denom != 1)
-    snprintf(sizeStr, 20, "%d_%d", sf.num, sf.denom);
+    snprintf(sizeStr, 24, "%d_%d", sf.num, sf.denom);
   else if (tilew != w || tileh != h)
-    snprintf(sizeStr, 20, "%dx%d", tilew, tileh);
-  else snprintf(sizeStr, 20, "full");
+    snprintf(sizeStr, 24, "%dx%d", tilew, tileh);
+  else snprintf(sizeStr, 24, "full");
   if (decompOnly)
     snprintf(tempStr, 1024, "%s_%s.%s", fileName, sizeStr, ext);
   else
@@ -260,19 +267,19 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
              qualStr, sizeStr, ext);
 
   if (tjSaveImage(tempStr, dstBuf, scaledw, 0, scaledh, pf, flags) == -1)
-    _throwtjg("saving bitmap");
+    THROW_TJG("saving bitmap");
   ptr = strrchr(tempStr, '.');
   snprintf(ptr, 1024 - (ptr - tempStr), "-err.%s", ext);
   if (srcBuf && sf.num == 1 && sf.denom == 1) {
     if (!quiet) printf("Compression error written to %s.\n", tempStr);
     if (subsamp == TJ_GRAYSCALE) {
-      int index, index2;
+      unsigned long index, index2;
 
       for (row = 0, index = 0; row < h; row++, index += pitch) {
         for (col = 0, index2 = index; col < w; col++, index2 += ps) {
-          int rindex = index2 + tjRedOffset[pf];
-          int gindex = index2 + tjGreenOffset[pf];
-          int bindex = index2 + tjBlueOffset[pf];
+          unsigned long rindex = index2 + tjRedOffset[pf];
+          unsigned long gindex = index2 + tjGreenOffset[pf];
+          unsigned long bindex = index2 + tjBlueOffset[pf];
           int y = (int)((double)srcBuf[rindex] * 0.299 +
                         (double)srcBuf[gindex] * 0.587 +
                         (double)srcBuf[bindex] * 0.114 + 0.5);
@@ -291,20 +298,20 @@ int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
             abs(dstBuf[pitch * row + col] - srcBuf[pitch * row + col]);
     }
     if (tjSaveImage(tempStr, dstBuf, w, 0, h, pf, flags) == -1)
-      _throwtjg("saving bitmap");
+      THROW_TJG("saving bitmap");
   }
 
 bailout:
   if (file) fclose(file);
   if (handle) tjDestroy(handle);
-  if (dstBuf && dstBufAlloc) free(dstBuf);
-  if (yuvBuf) free(yuvBuf);
+  if (dstBufAlloc) free(dstBuf);
+  free(yuvBuf);
   return retval;
 }
 
 
-int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
-             char *fileName)
+static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
+                    int jpegQual, char *fileName)
 {
   char tempStr[1024], tempStr2[80];
   FILE *file = NULL;
@@ -313,14 +320,17 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
     *srcPtr2;
   double start, elapsed, elapsedEncode;
   int totalJpegSize = 0, row, col, i, tilew = w, tileh = h, retval = 0;
-  int iter, yuvSize = 0;
-  unsigned long *jpegSize = NULL;
+  int iter;
+  unsigned long *jpegSize = NULL, yuvSize = 0;
   int ps = tjPixelSize[pf];
   int ntilesw = 1, ntilesh = 1, pitch = w * ps;
   const char *pfStr = pixFormatStr[pf];
 
-  if ((tmpBuf = (unsigned char *)malloc(pitch * h)) == NULL)
-    _throwunix("allocating temporary image buffer");
+  if ((unsigned long long)pitch * (unsigned long long)h >
+      (unsigned long long)((size_t)-1))
+    THROW("allocating temporary image buffer", "Image is too large");
+  if ((tmpBuf = (unsigned char *)malloc((size_t)pitch * h)) == NULL)
+    THROW_UNIX("allocating temporary image buffer");
 
   if (!quiet)
     printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
@@ -336,18 +346,20 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
 
     if ((jpegBuf = (unsigned char **)malloc(sizeof(unsigned char *) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG tile array");
+      THROW_UNIX("allocating JPEG tile array");
     memset(jpegBuf, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
     if ((jpegSize = (unsigned long *)malloc(sizeof(unsigned long) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG size array");
+      THROW_UNIX("allocating JPEG size array");
     memset(jpegSize, 0, sizeof(unsigned long) * ntilesw * ntilesh);
 
     if ((flags & TJFLAG_NOREALLOC) != 0)
       for (i = 0; i < ntilesw * ntilesh; i++) {
+        if (tjBufSize(tilew, tileh, subsamp) > (unsigned long)INT_MAX)
+          THROW("getting buffer size", "Image is too large");
         if ((jpegBuf[i] = (unsigned char *)
                           tjAlloc(tjBufSize(tilew, tileh, subsamp))) == NULL)
-          _throwunix("allocating JPEG tiles");
+          THROW_UNIX("allocating JPEG tiles");
       }
 
     /* Compression test */
@@ -358,12 +370,14 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
     for (i = 0; i < h; i++)
       memcpy(&tmpBuf[pitch * i], &srcBuf[w * ps * i], w * ps);
     if ((handle = tjInitCompress()) == NULL)
-      _throwtj("executing tjInitCompress()");
+      THROW_TJ("executing tjInitCompress()");
 
     if (doYUV) {
       yuvSize = tjBufSizeYUV2(tilew, yuvPad, tileh, subsamp);
+      if (yuvSize == (unsigned long)-1)
+        THROW_TJ("allocating YUV buffer");
       if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-        _throwunix("allocating YUV buffer");
+        THROW_UNIX("allocating YUV buffer");
       memset(yuvBuf, 127, yuvSize);
     }
 
@@ -387,17 +401,17 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
 
             if (tjEncodeYUV3(handle, srcPtr2, width, pitch, height, pf, yuvBuf,
                              yuvPad, subsamp, flags) == -1)
-              _throwtj("executing tjEncodeYUV3()");
+              THROW_TJ("executing tjEncodeYUV3()");
             if (iter >= 0) elapsedEncode += getTime() - startEncode;
             if (tjCompressFromYUV(handle, yuvBuf, width, yuvPad, height,
                                   subsamp, &jpegBuf[tile], &jpegSize[tile],
                                   jpegQual, flags) == -1)
-              _throwtj("executing tjCompressFromYUV()");
+              THROW_TJ("executing tjCompressFromYUV()");
           } else {
             if (tjCompress2(handle, srcPtr2, width, pitch, height, pf,
                             &jpegBuf[tile], &jpegSize[tile], subsamp, jpegQual,
                             flags) == -1)
-              _throwtj("executing tjCompress2()");
+              THROW_TJ("executing tjCompress2()");
           }
           totalJpegSize += jpegSize[tile];
         }
@@ -413,7 +427,7 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
     }
     if (doYUV) elapsed -= elapsedEncode;
 
-    if (tjDestroy(handle) == -1) _throwtj("executing tjDestroy()");
+    if (tjDestroy(handle) == -1) THROW_TJ("executing tjDestroy()");
     handle = NULL;
 
     if (quiet == 1) printf("%-5d  %-5d   ", tilew, tileh);
@@ -436,7 +450,7 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
       if (doYUV) {
         printf("Encode YUV    --> Frame rate:         %f fps\n",
                (double)iter / elapsedEncode);
-        printf("                  Output image size:  %d bytes\n", yuvSize);
+        printf("                  Output image size:  %lu bytes\n", yuvSize);
         printf("                  Compression ratio:  %f:1\n",
                (double)(w * h * ps) / (double)yuvSize);
         printf("                  Throughput:         %f Megapixels/sec\n",
@@ -460,9 +474,9 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
       snprintf(tempStr, 1024, "%s_%s_Q%d.jpg", fileName, subName[subsamp],
                jpegQual);
       if ((file = fopen(tempStr, "wb")) == NULL)
-        _throwunix("opening reference image");
+        THROW_UNIX("opening reference image");
       if (fwrite(jpegBuf[0], jpegSize[0], 1, file) != 1)
-        _throwunix("writing reference image");
+        THROW_UNIX("writing reference image");
       fclose(file);  file = NULL;
       if (!quiet) printf("Reference image written to %s\n", tempStr);
     }
@@ -472,10 +486,10 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
       if (decomp(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
                  fileName, tilew, tileh) == -1)
         goto bailout;
-    }
+    } else if (quiet == 1) printf("N/A\n");
 
     for (i = 0; i < ntilesw * ntilesh; i++) {
-      if (jpegBuf[i]) tjFree(jpegBuf[i]);
+      tjFree(jpegBuf[i]);
       jpegBuf[i] = NULL;
     }
     free(jpegBuf);  jpegBuf = NULL;
@@ -488,23 +502,21 @@ int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
   }
 
 bailout:
-  if (file) { fclose(file);  file = NULL; }
+  if (file) fclose(file);
   if (jpegBuf) {
-    for (i = 0; i < ntilesw * ntilesh; i++) {
-      if (jpegBuf[i]) tjFree(jpegBuf[i]);
-      jpegBuf[i] = NULL;
-    }
-    free(jpegBuf);  jpegBuf = NULL;
+    for (i = 0; i < ntilesw * ntilesh; i++)
+      tjFree(jpegBuf[i]);
   }
-  if (yuvBuf) { free(yuvBuf);  yuvBuf = NULL; }
-  if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
-  if (tmpBuf) { free(tmpBuf);  tmpBuf = NULL; }
-  if (handle) { tjDestroy(handle);  handle = NULL; }
+  free(jpegBuf);
+  free(yuvBuf);
+  free(jpegSize);
+  free(tmpBuf);
+  if (handle) tjDestroy(handle);
   return retval;
 }
 
 
-int decompTest(char *fileName)
+static int decompTest(char *fileName)
 {
   FILE *file = NULL;
   tjhandle handle = NULL;
@@ -521,26 +533,28 @@ int decompTest(char *fileName)
   int tw, th, ttilew, ttileh, tntilesw, tntilesh, tsubsamp;
 
   if ((file = fopen(fileName, "rb")) == NULL)
-    _throwunix("opening file");
+    THROW_UNIX("opening file");
   if (fseek(file, 0, SEEK_END) < 0 ||
       (srcSize = ftell(file)) == (unsigned long)-1)
-    _throwunix("determining file size");
+    THROW_UNIX("determining file size");
   if ((srcBuf = (unsigned char *)malloc(srcSize)) == NULL)
-    _throwunix("allocating memory");
+    THROW_UNIX("allocating memory");
   if (fseek(file, 0, SEEK_SET) < 0)
-    _throwunix("setting file position");
+    THROW_UNIX("setting file position");
   if (fread(srcBuf, srcSize, 1, file) < 1)
-    _throwunix("reading JPEG data");
+    THROW_UNIX("reading JPEG data");
   fclose(file);  file = NULL;
 
   temp = strrchr(fileName, '.');
   if (temp != NULL) *temp = '\0';
 
   if ((handle = tjInitTransform()) == NULL)
-    _throwtj("executing tjInitTransform()");
+    THROW_TJ("executing tjInitTransform()");
   if (tjDecompressHeader3(handle, srcBuf, srcSize, &w, &h, &subsamp,
                           &cs) == -1)
-    _throwtj("executing tjDecompressHeader3()");
+    THROW_TJ("executing tjDecompressHeader3()");
+  if (w < 1 || h < 1)
+    THROW("reading JPEG header", "Invalid image dimensions");
   if (cs == TJCS_YCCK || cs == TJCS_CMYK) {
     pf = TJPF_CMYK;  ps = tjPixelSize[pf];
   }
@@ -568,18 +582,21 @@ int decompTest(char *fileName)
 
     if ((jpegBuf = (unsigned char **)malloc(sizeof(unsigned char *) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG tile array");
+      THROW_UNIX("allocating JPEG tile array");
     memset(jpegBuf, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
     if ((jpegSize = (unsigned long *)malloc(sizeof(unsigned long) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG size array");
+      THROW_UNIX("allocating JPEG size array");
     memset(jpegSize, 0, sizeof(unsigned long) * ntilesw * ntilesh);
 
-    if ((flags & TJFLAG_NOREALLOC) != 0 || !doTile)
+    if ((flags & TJFLAG_NOREALLOC) != 0 &&
+        (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter))
       for (i = 0; i < ntilesw * ntilesh; i++) {
+        if (tjBufSize(tilew, tileh, subsamp) > (unsigned long)INT_MAX)
+          THROW("getting buffer size", "Image is too large");
         if ((jpegBuf[i] = (unsigned char *)
                           tjAlloc(tjBufSize(tilew, tileh, subsamp))) == NULL)
-          _throwunix("allocating JPEG tiles");
+          THROW_UNIX("allocating JPEG tiles");
       }
 
     tw = w;  th = h;  ttilew = tilew;  ttileh = tileh;
@@ -599,7 +616,7 @@ int decompTest(char *fileName)
     if (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter) {
       if ((t = (tjtransform *)malloc(sizeof(tjtransform) * ntilesw *
                                      ntilesh)) == NULL)
-        _throwunix("allocating image transform array");
+        THROW_UNIX("allocating image transform array");
 
       if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
           xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270) {
@@ -645,7 +662,7 @@ int decompTest(char *fileName)
         start = getTime();
         if (tjTransform(handle, srcBuf, srcSize, tntilesw * tntilesh, jpegBuf,
                         jpegSize, t, flags) == -1)
-          _throwtj("executing tjTransform()");
+          THROW_TJ("executing tjTransform()");
         elapsed += getTime() - start;
         if (iter >= 0) {
           iter++;
@@ -697,32 +714,31 @@ int decompTest(char *fileName)
     } else if (quiet == 1) printf("N/A\n");
 
     for (i = 0; i < ntilesw * ntilesh; i++) {
-      tjFree(jpegBuf[i]);  jpegBuf[i] = NULL;
+      tjFree(jpegBuf[i]);
+      jpegBuf[i] = NULL;
     }
     free(jpegBuf);  jpegBuf = NULL;
-    if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
+    free(jpegSize);  jpegSize = NULL;
 
     if (tilew == w && tileh == h) break;
   }
 
 bailout:
-  if (file) { fclose(file);  file = NULL; }
+  if (file) fclose(file);
   if (jpegBuf) {
-    for (i = 0; i < ntilesw * ntilesh; i++) {
-      if (jpegBuf[i]) tjFree(jpegBuf[i]);
-      jpegBuf[i] = NULL;
-    }
-    free(jpegBuf);  jpegBuf = NULL;
+    for (i = 0; i < ntilesw * ntilesh; i++)
+      tjFree(jpegBuf[i]);
   }
-  if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
-  if (srcBuf) { free(srcBuf);  srcBuf = NULL; }
-  if (t) { free(t);  t = NULL; }
+  free(jpegBuf);
+  free(jpegSize);
+  free(srcBuf);
+  free(t);
   if (handle) { tjDestroy(handle);  handle = NULL; }
   return retval;
 }
 
 
-void usage(char *progName)
+static void usage(char *progName)
 {
   int i;
 
@@ -801,7 +817,7 @@ int main(int argc, char *argv[])
   int minArg = 2, retval = 0, subsamp = -1;
 
   if ((scalingFactors = tjGetScalingFactors(&nsf)) == NULL || nsf == 0)
-    _throw("executing tjGetScalingFactors()", tjGetErrorStr());
+    THROW("executing tjGetScalingFactors()", tjGetErrorStr());
 
   if (argc < minArg) usage(argv[0]);
 
@@ -900,14 +916,14 @@ int main(int argc, char *argv[])
       else if (!strcasecmp(argv[i], "-copynone"))
         xformOpt |= TJXOPT_COPYNONE;
       else if (!strcasecmp(argv[i], "-benchtime") && i < argc - 1) {
-        double temp = atof(argv[++i]);
+        double tempd = atof(argv[++i]);
 
-        if (temp > 0.0) benchTime = temp;
+        if (tempd > 0.0) benchTime = tempd;
         else usage(argv[0]);
       } else if (!strcasecmp(argv[i], "-warmup") && i < argc - 1) {
-        double temp = atof(argv[++i]);
+        double tempd = atof(argv[++i]);
 
-        if (temp >= 0.0) warmup = temp;
+        if (tempd >= 0.0) warmup = tempd;
         else usage(argv[0]);
         printf("Warmup time = %.1f seconds\n\n", warmup);
       } else if (!strcasecmp(argv[i], "-alloc"))
@@ -918,16 +934,16 @@ int main(int argc, char *argv[])
         printf("Testing YUV planar encoding/decoding\n\n");
         doYUV = 1;
       } else if (!strcasecmp(argv[i], "-yuvpad") && i < argc - 1) {
-        int temp = atoi(argv[++i]);
+        int tempi = atoi(argv[++i]);
 
-        if (temp >= 1) yuvPad = temp;
+        if (tempi >= 1) yuvPad = tempi;
       } else if (!strcasecmp(argv[i], "-subsamp") && i < argc - 1) {
         i++;
         if (toupper(argv[i][0]) == 'G') subsamp = TJSAMP_GRAY;
         else {
-          int temp = atoi(argv[i]);
+          int tempi = atoi(argv[i]);
 
-          switch (temp) {
+          switch (tempi) {
           case 444:  subsamp = TJSAMP_444;  break;
           case 422:  subsamp = TJSAMP_422;  break;
           case 440:  subsamp = TJSAMP_440;  break;
@@ -959,7 +975,7 @@ int main(int argc, char *argv[])
 
   if (!decompOnly) {
     if ((srcBuf = tjLoadImage(argv[1], &w, 1, &h, &pf, flags)) == NULL)
-      _throwtjg("loading bitmap");
+      THROW_TJG("loading bitmap");
     temp = strrchr(argv[1], '.');
     if (temp != NULL) *temp = '\0';
   }
@@ -1006,6 +1022,6 @@ int main(int argc, char *argv[])
   }
 
 bailout:
-  if (srcBuf) tjFree(srcBuf);
+  tjFree(srcBuf);
   return retval;
 }
diff --git a/tjexample.c b/tjexample.c
index 61200e60..ef32c939 100644
--- a/tjexample.c
+++ b/tjexample.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (C)2011-2012, 2014-2015, 2017 D. R. Commander.
- *                                         All Rights Reserved.
+ * Copyright (C)2011-2012, 2014-2015, 2017, 2019 D. R. Commander.
+ *                                               All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -44,14 +44,14 @@
 #define strncasecmp  strnicmp
 #endif
 
-#define _throw(action, message) { \
+#define THROW(action, message) { \
   printf("ERROR in line %d while %s:\n%s\n", __LINE__, action, message); \
   retval = -1;  goto bailout; \
 }
 
-#define _throwtj(action)  _throw(action, tjGetErrorStr2(tjInstance))
+#define THROW_TJ(action)  THROW(action, tjGetErrorStr2(tjInstance))
 
-#define _throwunix(action)  _throw(action, strerror(errno))
+#define THROW_UNIX(action)  THROW(action, strerror(errno))
 
 #define DEFAULT_SUBSAMP  TJSAMP_444
 #define DEFAULT_QUALITY  95
@@ -71,9 +71,9 @@ int numScalingFactors = 0;
 
 /* DCT filter example.  This produces a negative of the image. */
 
-int customFilter(short *coeffs, tjregion arrayRegion, tjregion planeRegion,
-                 int componentIndex, int transformIndex,
-                 tjtransform *transform)
+static int customFilter(short *coeffs, tjregion arrayRegion,
+                        tjregion planeRegion, int componentIndex,
+                        int transformIndex, tjtransform *transform)
 {
   int i;
 
@@ -84,7 +84,7 @@ int customFilter(short *coeffs, tjregion arrayRegion, tjregion planeRegion,
 }
 
 
-void usage(char *programName)
+static void usage(char *programName)
 {
   int i;
 
@@ -172,7 +172,7 @@ int main(int argc, char **argv)
   tjhandle tjInstance = NULL;
 
   if ((scalingFactors = tjGetScalingFactors(&numScalingFactors)) == NULL)
-    _throwtj("getting scaling factors");
+    THROW_TJ("getting scaling factors");
   memset(&xform, 0, sizeof(tjtransform));
 
   if (argc < 3)
@@ -266,17 +266,17 @@ int main(int argc, char **argv)
 
     /* Read the JPEG file into memory. */
     if ((jpegFile = fopen(argv[1], "rb")) == NULL)
-      _throwunix("opening input file");
+      THROW_UNIX("opening input file");
     if (fseek(jpegFile, 0, SEEK_END) < 0 || ((size = ftell(jpegFile)) < 0) ||
         fseek(jpegFile, 0, SEEK_SET) < 0)
-      _throwunix("determining input file size");
+      THROW_UNIX("determining input file size");
     if (size == 0)
-      _throw("determining input file size", "Input file contains no data");
+      THROW("determining input file size", "Input file contains no data");
     jpegSize = (unsigned long)size;
     if ((jpegBuf = (unsigned char *)tjAlloc(jpegSize)) == NULL)
-      _throwunix("allocating JPEG buffer");
+      THROW_UNIX("allocating JPEG buffer");
     if (fread(jpegBuf, jpegSize, 1, jpegFile) < 1)
-      _throwunix("reading input file");
+      THROW_UNIX("reading input file");
     fclose(jpegFile);  jpegFile = NULL;
 
     if (doTransform) {
@@ -285,22 +285,22 @@ int main(int argc, char **argv)
       unsigned long dstSize = 0;
 
       if ((tjInstance = tjInitTransform()) == NULL)
-        _throwtj("initializing transformer");
+        THROW_TJ("initializing transformer");
       xform.options |= TJXOPT_TRIM;
       if (tjTransform(tjInstance, jpegBuf, jpegSize, 1, &dstBuf, &dstSize,
                       &xform, flags) < 0)
-        _throwtj("transforming input image");
+        THROW_TJ("transforming input image");
       tjFree(jpegBuf);
       jpegBuf = dstBuf;
       jpegSize = dstSize;
     } else {
       if ((tjInstance = tjInitDecompress()) == NULL)
-        _throwtj("initializing decompressor");
+        THROW_TJ("initializing decompressor");
     }
 
     if (tjDecompressHeader3(tjInstance, jpegBuf, jpegSize, &width, &height,
                             &inSubsamp, &inColorspace) < 0)
-      _throwtj("reading JPEG header");
+      THROW_TJ("reading JPEG header");
 
     printf("%s Image:  %d x %d pixels, %s subsampling, %s colorspace\n",
            (doTransform ? "Transformed" : "Input"), width, height,
@@ -312,9 +312,9 @@ int main(int argc, char **argv)
       /* Input image has been transformed, and no re-compression options
          have been selected.  Write the transformed image to disk and exit. */
       if ((jpegFile = fopen(argv[2], "wb")) == NULL)
-        _throwunix("opening output file");
+        THROW_UNIX("opening output file");
       if (fwrite(jpegBuf, jpegSize, 1, jpegFile) < 1)
-        _throwunix("writing output file");
+        THROW_UNIX("writing output file");
       fclose(jpegFile);  jpegFile = NULL;
       goto bailout;
     }
@@ -330,18 +330,18 @@ int main(int argc, char **argv)
     pixelFormat = TJPF_BGRX;
     if ((imgBuf = (unsigned char *)tjAlloc(width * height *
                                            tjPixelSize[pixelFormat])) == NULL)
-      _throwunix("allocating uncompressed image buffer");
+      THROW_UNIX("allocating uncompressed image buffer");
 
     if (tjDecompress2(tjInstance, jpegBuf, jpegSize, imgBuf, width, 0, height,
                       pixelFormat, flags) < 0)
-      _throwtj("decompressing JPEG image");
+      THROW_TJ("decompressing JPEG image");
     tjFree(jpegBuf);  jpegBuf = NULL;
     tjDestroy(tjInstance);  tjInstance = NULL;
   } else {
     /* Input image is not a JPEG image.  Load it into memory. */
     if ((imgBuf = tjLoadImage(argv[1], &width, 1, &height, &pixelFormat,
                               0)) == NULL)
-      _throwtj("loading input image");
+      THROW_TJ("loading input image");
     if (outSubsamp < 0) {
       if (pixelFormat == TJPF_GRAY)
         outSubsamp = TJSAMP_GRAY;
@@ -355,26 +355,27 @@ int main(int argc, char **argv)
 
   if (!strcasecmp(outFormat, "jpg")) {
     /* Output image format is JPEG.  Compress the uncompressed image. */
-    unsigned char *jpegBuf = NULL;  /* Dynamically allocate the JPEG buffer */
     unsigned long jpegSize = 0;
 
+    jpegBuf = NULL;  /* Dynamically allocate the JPEG buffer */
+
     if (outQual < 0)
       outQual = DEFAULT_QUALITY;
     printf(", %s subsampling, quality = %d\n", subsampName[outSubsamp],
            outQual);
 
     if ((tjInstance = tjInitCompress()) == NULL)
-      _throwtj("initializing compressor");
+      THROW_TJ("initializing compressor");
     if (tjCompress2(tjInstance, imgBuf, width, 0, height, pixelFormat,
                     &jpegBuf, &jpegSize, outSubsamp, outQual, flags) < 0)
-      _throwtj("compressing image");
+      THROW_TJ("compressing image");
     tjDestroy(tjInstance);  tjInstance = NULL;
 
     /* Write the JPEG image to disk. */
     if ((jpegFile = fopen(argv[2], "wb")) == NULL)
-      _throwunix("opening output file");
+      THROW_UNIX("opening output file");
     if (fwrite(jpegBuf, jpegSize, 1, jpegFile) < 1)
-      _throwunix("writing output file");
+      THROW_UNIX("writing output file");
     tjDestroy(tjInstance);  tjInstance = NULL;
     fclose(jpegFile);  jpegFile = NULL;
     tjFree(jpegBuf);  jpegBuf = NULL;
@@ -383,13 +384,13 @@ int main(int argc, char **argv)
        directly to disk. */
     printf("\n");
     if (tjSaveImage(argv[2], imgBuf, width, 0, height, pixelFormat, 0) < 0)
-      _throwtj("saving output image");
+      THROW_TJ("saving output image");
   }
 
 bailout:
-  if (imgBuf) tjFree(imgBuf);
+  tjFree(imgBuf);
   if (tjInstance) tjDestroy(tjInstance);
-  if (jpegBuf) tjFree(jpegBuf);
+  tjFree(jpegBuf);
   if (jpegFile) fclose(jpegFile);
   return retval;
 }
diff --git a/tjunittest.c b/tjunittest.c
index ae72e836..f59939fd 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2014, 2017-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014, 2017-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,7 +46,7 @@
 #endif
 
 
-void usage(char *progName)
+static void usage(char *progName)
 {
   printf("\nUSAGE: %s [options]\n\n", progName);
   printf("Options:\n");
@@ -59,16 +59,16 @@ void usage(char *progName)
 }
 
 
-#define _throwtj() { \
+#define THROW_TJ() { \
   printf("TurboJPEG ERROR:\n%s\n", tjGetErrorStr()); \
-  bailout() \
+  BAILOUT() \
 }
-#define _tj(f) { if ((f) == -1) _throwtj(); }
-#define _throw(m) { printf("ERROR: %s\n", m);  bailout() }
-#define _throwmd5(filename, md5sum, ref) { \
+#define TRY_TJ(f) { if ((f) == -1) THROW_TJ(); }
+#define THROW(m) { printf("ERROR: %s\n", m);  BAILOUT() }
+#define THROW_MD5(filename, md5sum, ref) { \
   printf("\n%s has an MD5 sum of %s.\n   Should be %s.\n", filename, md5sum, \
          ref); \
-  bailout() \
+  BAILOUT() \
 }
 
 const char *subNameLong[TJ_NUMSAMP] = {
@@ -93,10 +93,10 @@ const int _onlyRGB[] = { TJPF_RGB };
 int doYUV = 0, alloc = 0, pad = 4;
 
 int exitStatus = 0;
-#define bailout() { exitStatus = -1;  goto bailout; }
+#define BAILOUT() { exitStatus = -1;  goto bailout; }
 
 
-void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
+static void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
 {
   int roffset = tjRedOffset[pf];
   int goffset = tjGreenOffset[pf];
@@ -151,7 +151,7 @@ void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
 }
 
 
-#define checkval(v, cv) { \
+#define CHECKVAL(v, cv) { \
   if (v < cv - 1 || v > cv + 1) { \
     printf("\nComp. %s at %d,%d should be %d, not %d\n", #v, row, col, cv, \
            v); \
@@ -159,14 +159,14 @@ void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
   } \
 }
 
-#define checkval0(v) { \
+#define CHECKVAL0(v) { \
   if (v > 1) { \
     printf("\nComp. %s at %d,%d should be 0, not %d\n", #v, row, col, v); \
     retval = 0;  exitStatus = -1;  goto bailout; \
   } \
 }
 
-#define checkval255(v) { \
+#define CHECKVAL255(v) { \
   if (v < 254) { \
     printf("\nComp. %s at %d,%d should be 255, not %d\n", #v, row, col, v); \
     retval = 0;  exitStatus = -1;  goto bailout; \
@@ -174,8 +174,8 @@ void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
 }
 
 
-int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
-             tjscalingfactor sf, int flags)
+static int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
+                    tjscalingfactor sf, int flags)
 {
   int roffset = tjRedOffset[pf];
   int goffset = tjGreenOffset[pf];
@@ -200,13 +200,13 @@ int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
         y = buf[index * ps + 2];
         k = buf[index * ps + 3];
         if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
-          checkval255(c);  checkval255(m);  checkval255(y);
-          if (row < halfway) checkval255(k)
-          else checkval0(k)
+          CHECKVAL255(c);  CHECKVAL255(m);  CHECKVAL255(y);
+          if (row < halfway) CHECKVAL255(k)
+          else CHECKVAL0(k)
         } else {
-          checkval255(c);  checkval0(y);  checkval255(k);
-          if (row < halfway) checkval0(m)
-          else checkval255(m)
+          CHECKVAL255(c);  CHECKVAL0(y);  CHECKVAL255(k);
+          if (row < halfway) CHECKVAL0(m)
+          else CHECKVAL255(m)
         }
       }
     }
@@ -225,26 +225,26 @@ int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
       a = aoffset >= 0 ? buf[index * ps + aoffset] : 0xFF;
       if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
         if (row < halfway) {
-          checkval255(r);  checkval255(g);  checkval255(b);
+          CHECKVAL255(r);  CHECKVAL255(g);  CHECKVAL255(b);
         } else {
-          checkval0(r);  checkval0(g);  checkval0(b);
+          CHECKVAL0(r);  CHECKVAL0(g);  CHECKVAL0(b);
         }
       } else {
         if (subsamp == TJSAMP_GRAY) {
           if (row < halfway) {
-            checkval(r, 76);  checkval(g, 76);  checkval(b, 76);
+            CHECKVAL(r, 76);  CHECKVAL(g, 76);  CHECKVAL(b, 76);
           } else {
-            checkval(r, 226);  checkval(g, 226);  checkval(b, 226);
+            CHECKVAL(r, 226);  CHECKVAL(g, 226);  CHECKVAL(b, 226);
           }
         } else {
           if (row < halfway) {
-            checkval255(r);  checkval0(g);  checkval0(b);
+            CHECKVAL255(r);  CHECKVAL0(g);  CHECKVAL0(b);
           } else {
-            checkval255(r);  checkval255(g);  checkval0(b);
+            CHECKVAL255(r);  CHECKVAL255(g);  CHECKVAL0(b);
           }
         }
       }
-      checkval255(a);
+      CHECKVAL255(a);
     }
   }
 
@@ -270,8 +270,8 @@ bailout:
 
 #define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
 
-int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
-                tjscalingfactor sf)
+static int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
+                       tjscalingfactor sf)
 {
   int row, col;
   int hsf = tjMCUWidth[subsamp] / 8, vsf = tjMCUHeight[subsamp] / 8;
@@ -287,16 +287,16 @@ int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
       unsigned char y = buf[ypitch * row + col];
 
       if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
-        if (row < halfway) checkval255(y)
-        else checkval0(y);
+        if (row < halfway) CHECKVAL255(y)
+        else CHECKVAL0(y);
       } else {
-        if (row < halfway) checkval(y, 76)
-        else checkval(y, 226);
+        if (row < halfway) CHECKVAL(y, 76)
+        else CHECKVAL(y, 226);
       }
     }
   }
   if (subsamp != TJSAMP_GRAY) {
-    int halfway = 16 / vsf * sf.num / sf.denom;
+    halfway = 16 / vsf * sf.num / sf.denom;
 
     for (row = 0; row < ch; row++) {
       for (col = 0; col < cw; col++) {
@@ -304,12 +304,12 @@ int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
           v = buf[ypitch * ph + uvpitch * ch + (uvpitch * row + col)];
 
         if (((row * vsf / blocksize) + (col * hsf / blocksize)) % 2 == 0) {
-          checkval(u, 128);  checkval(v, 128);
+          CHECKVAL(u, 128);  CHECKVAL(v, 128);
         } else {
           if (row < halfway) {
-            checkval(u, 85);  checkval255(v);
+            CHECKVAL(u, 85);  CHECKVAL255(v);
           } else {
-            checkval0(u);  checkval(v, 149);
+            CHECKVAL0(u);  CHECKVAL(v, 149);
           }
         }
       }
@@ -342,13 +342,14 @@ bailout:
 }
 
 
-void writeJPEG(unsigned char *jpegBuf, unsigned long jpegSize, char *filename)
+static void writeJPEG(unsigned char *jpegBuf, unsigned long jpegSize,
+                      char *filename)
 {
   FILE *file = fopen(filename, "wb");
 
   if (!file || fwrite(jpegBuf, jpegSize, 1, file) != 1) {
     printf("ERROR: Could not write to %s.\n%s\n", filename, strerror(errno));
-    bailout()
+    BAILOUT()
   }
 
 bailout:
@@ -356,9 +357,9 @@ bailout:
 }
 
 
-void compTest(tjhandle handle, unsigned char **dstBuf, unsigned long *dstSize,
-              int w, int h, int pf, char *basename, int subsamp, int jpegQual,
-              int flags)
+static void compTest(tjhandle handle, unsigned char **dstBuf,
+                     unsigned long *dstSize, int w, int h, int pf,
+                     char *basename, int subsamp, int jpegQual, int flags)
 {
   char tempStr[1024];
   unsigned char *srcBuf = NULL, *yuvBuf = NULL;
@@ -368,7 +369,7 @@ void compTest(tjhandle handle, unsigned char **dstBuf, unsigned long *dstSize,
   const char *buStr = (flags & TJFLAG_BOTTOMUP) ? "BU" : "TD";
 
   if ((srcBuf = (unsigned char *)malloc(w * h * tjPixelSize[pf])) == NULL)
-    _throw("Memory allocation failure");
+    THROW("Memory allocation failure");
   initBuf(srcBuf, w, h, pf, flags);
 
   if (*dstBuf && *dstSize > 0) memset(*dstBuf, 0, *dstSize);
@@ -379,28 +380,28 @@ void compTest(tjhandle handle, unsigned char **dstBuf, unsigned long *dstSize,
     tjscalingfactor sf = { 1, 1 };
     tjhandle handle2 = tjInitCompress();
 
-    if (!handle2) _throwtj();
+    if (!handle2) THROW_TJ();
 
     if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-      _throw("Memory allocation failure");
+      THROW("Memory allocation failure");
     memset(yuvBuf, 0, yuvSize);
 
     printf("%s %s -> YUV %s ... ", pfStr, buStrLong, subNameLong[subsamp]);
-    _tj(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
-                     flags));
+    TRY_TJ(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
+                        flags));
     tjDestroy(handle2);
     if (checkBufYUV(yuvBuf, w, h, subsamp, sf)) printf("Passed.\n");
     else printf("FAILED!\n");
 
     printf("YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp], buStrLong,
            jpegQual);
-    _tj(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf, dstSize,
-                          jpegQual, flags));
+    TRY_TJ(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf,
+                             dstSize, jpegQual, flags));
   } else {
     printf("%s %s -> %s Q%d ... ", pfStr, buStrLong, subNameLong[subsamp],
            jpegQual);
-    _tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
-                    jpegQual, flags));
+    TRY_TJ(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
+                       jpegQual, flags));
   }
 
   snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr, buStr,
@@ -409,14 +410,15 @@ void compTest(tjhandle handle, unsigned char **dstBuf, unsigned long *dstSize,
   printf("Done.\n  Result in %s\n", tempStr);
 
 bailout:
-  if (yuvBuf) free(yuvBuf);
-  if (srcBuf) free(srcBuf);
+  free(yuvBuf);
+  free(srcBuf);
 }
 
 
-void _decompTest(tjhandle handle, unsigned char *jpegBuf,
-                 unsigned long jpegSize, int w, int h, int pf, char *basename,
-                 int subsamp, int flags, tjscalingfactor sf)
+static void _decompTest(tjhandle handle, unsigned char *jpegBuf,
+                        unsigned long jpegSize, int w, int h, int pf,
+                        char *basename, int subsamp, int flags,
+                        tjscalingfactor sf)
 {
   unsigned char *dstBuf = NULL, *yuvBuf = NULL;
   int _hdrw = 0, _hdrh = 0, _hdrsubsamp = -1;
@@ -424,14 +426,14 @@ void _decompTest(tjhandle handle, unsigned char *jpegBuf,
   int scaledHeight = TJSCALED(h, sf);
   unsigned long dstSize = 0;
 
-  _tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
-                          &_hdrsubsamp));
+  TRY_TJ(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
+                             &_hdrsubsamp));
   if (_hdrw != w || _hdrh != h || _hdrsubsamp != subsamp)
-    _throw("Incorrect JPEG header");
+    THROW("Incorrect JPEG header");
 
   dstSize = scaledWidth * scaledHeight * tjPixelSize[pf];
   if ((dstBuf = (unsigned char *)malloc(dstSize)) == NULL)
-    _throw("Memory allocation failure");
+    THROW("Memory allocation failure");
   memset(dstBuf, 0, dstSize);
 
   if (doYUV) {
@@ -439,26 +441,26 @@ void _decompTest(tjhandle handle, unsigned char *jpegBuf,
                                           subsamp);
     tjhandle handle2 = tjInitDecompress();
 
-    if (!handle2) _throwtj();
+    if (!handle2) THROW_TJ();
 
     if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-      _throw("Memory allocation failure");
+      THROW("Memory allocation failure");
     memset(yuvBuf, 0, yuvSize);
 
     printf("JPEG -> YUV %s ", subNameLong[subsamp]);
     if (sf.num != 1 || sf.denom != 1)
       printf("%d/%d ... ", sf.num, sf.denom);
     else printf("... ");
-    _tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth, pad,
-                           scaledHeight, flags));
+    TRY_TJ(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth,
+                              pad, scaledHeight, flags));
     if (checkBufYUV(yuvBuf, scaledWidth, scaledHeight, subsamp, sf))
       printf("Passed.\n");
     else printf("FAILED!\n");
 
     printf("YUV %s -> %s %s ... ", subNameLong[subsamp], pixFormatStr[pf],
            (flags & TJFLAG_BOTTOMUP) ? "Bottom-Up" : "Top-Down ");
-    _tj(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
-                    scaledHeight, pf, flags));
+    TRY_TJ(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
+                       scaledHeight, pf, flags));
     tjDestroy(handle2);
   } else {
     printf("JPEG -> %s %s ", pixFormatStr[pf],
@@ -466,8 +468,8 @@ void _decompTest(tjhandle handle, unsigned char *jpegBuf,
     if (sf.num != 1 || sf.denom != 1)
       printf("%d/%d ... ", sf.num, sf.denom);
     else printf("... ");
-    _tj(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
-                      scaledHeight, pf, flags));
+    TRY_TJ(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
+                         scaledHeight, pf, flags));
   }
 
   if (checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
@@ -476,19 +478,19 @@ void _decompTest(tjhandle handle, unsigned char *jpegBuf,
   printf("\n");
 
 bailout:
-  if (yuvBuf) free(yuvBuf);
-  if (dstBuf) free(dstBuf);
+  free(yuvBuf);
+  free(dstBuf);
 }
 
 
-void decompTest(tjhandle handle, unsigned char *jpegBuf,
-                unsigned long jpegSize, int w, int h, int pf, char *basename,
-                int subsamp, int flags)
+static void decompTest(tjhandle handle, unsigned char *jpegBuf,
+                       unsigned long jpegSize, int w, int h, int pf,
+                       char *basename, int subsamp, int flags)
 {
   int i, n = 0;
   tjscalingfactor *sf = tjGetScalingFactors(&n);
 
-  if (!sf || !n) _throwtj();
+  if (!sf || !n) THROW_TJ();
 
   for (i = 0; i < n; i++) {
     if (subsamp == TJSAMP_444 || subsamp == TJSAMP_GRAY ||
@@ -505,8 +507,8 @@ bailout:
 }
 
 
-void doTest(int w, int h, const int *formats, int nformats, int subsamp,
-            char *basename)
+static void doTest(int w, int h, const int *formats, int nformats, int subsamp,
+                   char *basename)
 {
   tjhandle chandle = NULL, dhandle = NULL;
   unsigned char *dstBuf = NULL;
@@ -517,11 +519,11 @@ void doTest(int w, int h, const int *formats, int nformats, int subsamp,
     size = tjBufSize(w, h, subsamp);
   if (size != 0)
     if ((dstBuf = (unsigned char *)tjAlloc(size)) == NULL)
-      _throw("Memory allocation failure.");
+      THROW("Memory allocation failure.");
 
   if ((chandle = tjInitCompress()) == NULL ||
       (dhandle = tjInitDecompress()) == NULL)
-    _throwtj();
+    THROW_TJ();
 
   for (pfi = 0; pfi < nformats; pfi++) {
     for (i = 0; i < 2; i++) {
@@ -548,18 +550,54 @@ void doTest(int w, int h, const int *formats, int nformats, int subsamp,
 bailout:
   if (chandle) tjDestroy(chandle);
   if (dhandle) tjDestroy(dhandle);
-  if (dstBuf) tjFree(dstBuf);
+  tjFree(dstBuf);
 }
 
 
-void bufSizeTest(void)
+#if SIZEOF_SIZE_T == 8  /* pass only if the result is at least 0xFFFFFFFF */
+#define CHECKSIZE(function) { \
+  if ((unsigned long long)size < (unsigned long long)0xFFFFFFFF) \
+    THROW(#function " overflow"); \
+}
+#else  /* pass only if size is (unsigned long)-1 and error is not "No error" */
+#define CHECKSIZE(function) { \
+  if (size != (unsigned long)(-1) || \
+      !strcmp(tjGetErrorStr2(NULL), "No error")) \
+    THROW(#function " overflow"); \
+}
+#endif  /* SIZEOF_SIZE_T == 8 */
+
+static void overflowTest(void)
+{
+  /* Ensure that the buffer size functions don't overflow (see CHECKSIZE) */
+  unsigned long size;  /* examined by every CHECKSIZE() invocation below */
+
+  size = tjBufSize(26755, 26755, TJSAMP_444);
+  CHECKSIZE(tjBufSize());
+  size = TJBUFSIZE(26755, 26755);
+  CHECKSIZE(TJBUFSIZE());
+  size = tjBufSizeYUV2(37838, 1, 37838, TJSAMP_444);
+  CHECKSIZE(tjBufSizeYUV2());
+  size = TJBUFSIZEYUV(37838, 37838, TJSAMP_444);
+  CHECKSIZE(TJBUFSIZEYUV());
+  size = tjBufSizeYUV(37838, 37838, TJSAMP_444);
+  CHECKSIZE(tjBufSizeYUV());
+  size = tjPlaneSizeYUV(0, 65536, 0, 65536, TJSAMP_444);
+  CHECKSIZE(tjPlaneSizeYUV());
+
+bailout:  /* THROW(), reached through CHECKSIZE(), jumps here on failure */
+  return;
+}
+
+
+static void bufSizeTest(void)
 {
   int w, h, i, subsamp;
   unsigned char *srcBuf = NULL, *dstBuf = NULL;
   tjhandle handle = NULL;
   unsigned long dstSize = 0;
 
-  if ((handle = tjInitCompress()) == NULL) _throwtj();
+  if ((handle = tjInitCompress()) == NULL) THROW_TJ();
 
   printf("Buffer size regression test\n");
   for (subsamp = 0; subsamp < TJ_NUMSAMP; subsamp++) {
@@ -569,12 +607,12 @@ void bufSizeTest(void)
       for (h = 1; h < maxh; h++) {
         if (h % 100 == 0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", w, h);
         if ((srcBuf = (unsigned char *)malloc(w * h * 4)) == NULL)
-          _throw("Memory allocation failure");
+          THROW("Memory allocation failure");
         if (!alloc || doYUV) {
           if (doYUV) dstSize = tjBufSizeYUV2(w, pad, h, subsamp);
           else dstSize = tjBufSize(w, h, subsamp);
           if ((dstBuf = (unsigned char *)tjAlloc(dstSize)) == NULL)
-            _throw("Memory allocation failure");
+            THROW("Memory allocation failure");
         }
 
         for (i = 0; i < w * h * 4; i++) {
@@ -583,12 +621,12 @@ void bufSizeTest(void)
         }
 
         if (doYUV) {
-          _tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
-                           subsamp, 0));
+          TRY_TJ(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
+                              subsamp, 0));
         } else {
-          _tj(tjCompress2(handle, srcBuf, w, 0, h, TJPF_BGRX, &dstBuf,
-                          &dstSize, subsamp, 100,
-                          alloc ? 0 : TJFLAG_NOREALLOC));
+          TRY_TJ(tjCompress2(handle, srcBuf, w, 0, h, TJPF_BGRX, &dstBuf,
+                             &dstSize, subsamp, 100,
+                             alloc ? 0 : TJFLAG_NOREALLOC));
         }
         free(srcBuf);  srcBuf = NULL;
         if (!alloc || doYUV) {
@@ -596,12 +634,12 @@ void bufSizeTest(void)
         }
 
         if ((srcBuf = (unsigned char *)malloc(h * w * 4)) == NULL)
-          _throw("Memory allocation failure");
+          THROW("Memory allocation failure");
         if (!alloc || doYUV) {
           if (doYUV) dstSize = tjBufSizeYUV2(h, pad, w, subsamp);
           else dstSize = tjBufSize(h, w, subsamp);
           if ((dstBuf = (unsigned char *)tjAlloc(dstSize)) == NULL)
-            _throw("Memory allocation failure");
+            THROW("Memory allocation failure");
         }
 
         for (i = 0; i < h * w * 4; i++) {
@@ -610,12 +648,12 @@ void bufSizeTest(void)
         }
 
         if (doYUV) {
-          _tj(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
-                           subsamp, 0));
+          TRY_TJ(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
+                              subsamp, 0));
         } else {
-          _tj(tjCompress2(handle, srcBuf, h, 0, w, TJPF_BGRX, &dstBuf,
-                          &dstSize, subsamp, 100,
-                          alloc ? 0 : TJFLAG_NOREALLOC));
+          TRY_TJ(tjCompress2(handle, srcBuf, h, 0, w, TJPF_BGRX, &dstBuf,
+                             &dstSize, subsamp, 100,
+                             alloc ? 0 : TJFLAG_NOREALLOC));
         }
         free(srcBuf);  srcBuf = NULL;
         if (!alloc || doYUV) {
@@ -627,14 +665,14 @@ void bufSizeTest(void)
   printf("Done.      \n");
 
 bailout:
-  if (srcBuf) free(srcBuf);
-  if (dstBuf) tjFree(dstBuf);
+  free(srcBuf);
+  tjFree(dstBuf);
   if (handle) tjDestroy(handle);
 }
 
 
-void initBitmap(unsigned char *buf, int width, int pitch, int height, int pf,
-                int flags)
+static void initBitmap(unsigned char *buf, int width, int pitch, int height,
+                       int pf, int flags)
 {
   int roffset = tjRedOffset[pf];
   int goffset = tjGreenOffset[pf];
@@ -667,8 +705,8 @@ void initBitmap(unsigned char *buf, int width, int pitch, int height, int pf,
 }
 
 
-int cmpBitmap(unsigned char *buf, int width, int pitch, int height, int pf,
-              int flags, int gray2rgb)
+static int cmpBitmap(unsigned char *buf, int width, int pitch, int height,
+                     int pf, int flags, int gray2rgb)
 {
   int roffset = tjRedOffset[pf];
   int goffset = tjGreenOffset[pf];
@@ -718,8 +756,8 @@ int cmpBitmap(unsigned char *buf, int width, int pitch, int height, int pf,
 }
 
 
-int doBmpTest(const char *ext, int width, int align, int height, int pf,
-              int flags)
+static int doBmpTest(const char *ext, int width, int align, int height, int pf,
+                     int flags)
 {
   char filename[80], *md5sum, md5buf[65];
   int ps = tjPixelSize[pf], pitch = PAD(width * ps, align), loadWidth = 0,
@@ -736,20 +774,20 @@ int doBmpTest(const char *ext, int width, int align, int height, int pf,
   }
 
   if ((buf = (unsigned char *)tjAlloc(pitch * height)) == NULL)
-    _throw("Could not allocate memory");
+    THROW("Could not allocate memory");
   initBitmap(buf, width, pitch, height, pf, flags);
 
   snprintf(filename, 80, "test_bmp_%s_%d_%s.%s", pixFormatStr[pf], align,
            (flags & TJFLAG_BOTTOMUP) ? "bu" : "td", ext);
-  _tj(tjSaveImage(filename, buf, width, pitch, height, pf, flags));
+  TRY_TJ(tjSaveImage(filename, buf, width, pitch, height, pf, flags));
   md5sum = MD5File(filename, md5buf);
   if (strcasecmp(md5sum, md5ref))
-    _throwmd5(filename, md5sum, md5ref);
+    THROW_MD5(filename, md5sum, md5ref);
 
   tjFree(buf);  buf = NULL;
   if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
                          flags)) == NULL)
-    _throwtj();
+    THROW_TJ();
   if (width != loadWidth || height != loadHeight) {
     printf("\n   Image dimensions of %s are bogus\n", filename);
     retval = -1;  goto bailout;
@@ -763,7 +801,7 @@ int doBmpTest(const char *ext, int width, int align, int height, int pf,
     pf = TJPF_XBGR;
     if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
                            flags)) == NULL)
-      _throwtj();
+      THROW_TJ();
     pitch = PAD(width * tjPixelSize[pf], align);
     if (!cmpBitmap(buf, width, pitch, height, pf, flags, 1)) {
       printf("\n   Converting %s to RGB failed\n", filename);
@@ -774,7 +812,7 @@ int doBmpTest(const char *ext, int width, int align, int height, int pf,
     pf = TJPF_CMYK;
     if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
                            flags)) == NULL)
-      _throwtj();
+      THROW_TJ();
     pitch = PAD(width * tjPixelSize[pf], align);
     if (!cmpBitmap(buf, width, pitch, height, pf, flags, 1)) {
       printf("\n   Converting %s to CMYK failed\n", filename);
@@ -788,7 +826,7 @@ int doBmpTest(const char *ext, int width, int align, int height, int pf,
   pixelFormat = TJPF_UNKNOWN;
   if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight,
                          &pixelFormat, flags)) == NULL)
-    _throwtj();
+    THROW_TJ();
   if ((pf == TJPF_GRAY && pixelFormat != TJPF_GRAY) ||
       (pf != TJPF_GRAY && !strcasecmp(ext, "bmp") &&
        pixelFormat != TJPF_BGR) ||
@@ -801,13 +839,13 @@ int doBmpTest(const char *ext, int width, int align, int height, int pf,
   unlink(filename);
 
 bailout:
-  if (buf) tjFree(buf);
+  tjFree(buf);
   if (exitStatus < 0) return exitStatus;
   return retval;
 }
 
 
-int bmpTest(void)
+static int bmpTest(void)
 {
   int align, width = 35, height = 39, format;
 
@@ -863,6 +901,7 @@ int main(int argc, char *argv[])
   }
   if (alloc) printf("Testing automatic buffer allocation\n");
   if (doYUV) num4bf = 4;
+  overflowTest();
   doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
   doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
   doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
diff --git a/tjutil.c b/tjutil.c
index b44086de..2018160b 100644
--- a/tjutil.c
+++ b/tjutil.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011, 2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,6 +29,7 @@
 #ifdef _WIN32
 
 #include <windows.h>
+#include "tjutil.h"
 
 static double getFreq(void)
 {
@@ -56,6 +57,7 @@ double getTime(void)
 
 #include <stdlib.h>
 #include <sys/time.h>
+#include "tjutil.h"
 
 double getTime(void)
 {
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index d0ca37d9..93634501 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2017 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,59 +35,58 @@
 #include <jni.h>
 #include "java/org_libjpegturbo_turbojpeg_TJCompressor.h"
 #include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJTransformer.h"
 #include "java/org_libjpegturbo_turbojpeg_TJ.h"
 
-#define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
-
-#define bailif0(f) { \
+#define BAILIF0(f) { \
   if (!(f) || (*env)->ExceptionCheck(env)) { \
     goto bailout; \
   } \
 }
 
-#define _throw(msg, exceptionClass) { \
+#define THROW(msg, exceptionClass) { \
   jclass _exccls = (*env)->FindClass(env, exceptionClass); \
   \
-  bailif0(_exccls); \
+  BAILIF0(_exccls); \
   (*env)->ThrowNew(env, _exccls, msg); \
   goto bailout; \
 }
 
-#define _throwtj() { \
+#define THROW_TJ() { \
   jclass _exccls; \
   jmethodID _excid; \
   jobject _excobj; \
   jstring _errstr; \
   \
-  bailif0(_errstr = (*env)->NewStringUTF(env, tjGetErrorStr2(handle))); \
-  bailif0(_exccls = (*env)->FindClass(env, \
+  BAILIF0(_errstr = (*env)->NewStringUTF(env, tjGetErrorStr2(handle))); \
+  BAILIF0(_exccls = (*env)->FindClass(env, \
     "org/libjpegturbo/turbojpeg/TJException")); \
-  bailif0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
+  BAILIF0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
                                        "(Ljava/lang/String;I)V")); \
-  bailif0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
+  BAILIF0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
                                       tjGetErrorCode(handle))); \
   (*env)->Throw(env, _excobj); \
   goto bailout; \
 }
 
-#define _throwarg(msg)  _throw(msg, "java/lang/IllegalArgumentException")
+#define THROW_ARG(msg)  THROW(msg, "java/lang/IllegalArgumentException")
 
-#define _throwmem() \
-  _throw("Memory allocation failure", "java/lang/OutOfMemoryError");
+#define THROW_MEM() \
+  THROW("Memory allocation failure", "java/lang/OutOfMemoryError");
 
-#define gethandle() \
+#define GET_HANDLE() \
   jclass _cls = (*env)->GetObjectClass(env, obj); \
   jfieldID _fid; \
   \
-  bailif0(_cls); \
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
+  BAILIF0(_cls); \
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
   handle = (tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);
 
 #ifdef _WIN32
 #define setenv(envvar, value, dummy)  _putenv_s(envvar, value)
 #endif
 
-#define prop2env(property, envvar) { \
+#define PROP2ENV(property, envvar) { \
   if ((jName = (*env)->NewStringUTF(env, property)) != NULL && \
       (jValue = (*env)->CallStaticObjectMethod(env, cls, mid, \
                                                jName)) != NULL) { \
@@ -98,21 +97,27 @@
   } \
 }
 
-int ProcessSystemProperties(JNIEnv *env)
+#define SAFE_RELEASE(javaArray, cArray) { \
+  if (javaArray && cArray) \
+    (*env)->ReleasePrimitiveArrayCritical(env, javaArray, (void *)cArray, 0); \
+  cArray = NULL; \
+}
+
+static int ProcessSystemProperties(JNIEnv *env)
 {
   jclass cls;
   jmethodID mid;
   jstring jName, jValue;
   const char *value;
 
-  bailif0(cls = (*env)->FindClass(env, "java/lang/System"));
-  bailif0(mid = (*env)->GetStaticMethodID(env, cls, "getProperty",
+  BAILIF0(cls = (*env)->FindClass(env, "java/lang/System"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "getProperty",
     "(Ljava/lang/String;)Ljava/lang/String;"));
 
-  prop2env("turbojpeg.optimize", "TJ_OPTIMIZE");
-  prop2env("turbojpeg.arithmetic", "TJ_ARITHMETIC");
-  prop2env("turbojpeg.restart", "TJ_RESTART");
-  prop2env("turbojpeg.progressive", "TJ_PROGRESSIVE");
+  PROP2ENV("turbojpeg.optimize", "TJ_OPTIMIZE");
+  PROP2ENV("turbojpeg.arithmetic", "TJ_ARITHMETIC");
+  PROP2ENV("turbojpeg.restart", "TJ_RESTART");
+  PROP2ENV("turbojpeg.progressive", "TJ_PROGRESSIVE");
   return 0;
 
 bailout:
@@ -125,7 +130,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
 {
   jint retval = (jint)tjBufSize(width, height, jpegSubsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -137,7 +142,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
 {
   jint retval = (jint)tjBufSizeYUV2(width, pad, height, subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -160,7 +165,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII
   jint retval = (jint)tjPlaneSizeYUV(componentID, width, stride, height,
                                      subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -172,7 +177,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III
 {
   jint retval = (jint)tjPlaneWidth(componentID, width, subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -184,7 +189,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III
 {
   jint retval = (jint)tjPlaneHeight(componentID, height, subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -199,10 +204,10 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
   tjhandle handle;
 
   if ((handle = tjInitCompress()) == NULL)
-    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-  bailif0(cls = (*env)->GetObjectClass(env, obj));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
   (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 bailout:
@@ -219,35 +224,38 @@ static jint TJCompressor_compress
   jsize arraySize = 0, actualPitch;
   unsigned char *srcBuf = NULL, *jpegBuf = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
       height < 1 || pitch < 0)
-    _throwarg("Invalid argument in compress()");
+    THROW_ARG("Invalid argument in compress()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
   jpegSize = tjBufSize(width, height, jpegSubsamp);
   if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
-    _throwarg("Destination buffer is not large enough");
-
-  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+    THROW_ARG("Destination buffer is not large enough");
 
   if (ProcessSystemProperties(env) < 0) goto bailout;
 
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
   if (tjCompress2(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
                   width, pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp,
-                  jpegQual, flags | TJFLAG_NOREALLOC) == -1)
-    _throwtj();
+                  jpegQual, flags | TJFLAG_NOREALLOC) == -1) {
+    SAFE_RELEASE(dst, jpegBuf);
+    SAFE_RELEASE(src, srcBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+  SAFE_RELEASE(dst, jpegBuf);
+  SAFE_RELEASE(src, srcBuf);
   return (jint)jpegSize;
 }
 
@@ -278,9 +286,9 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3
    jint jpegQual, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in compress()");
+    THROW_ARG("Invalid argument in compress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
 
   return TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
                                stride * sizeof(jint), height, pf, dst,
@@ -297,9 +305,9 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3
    jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in compress()");
+    THROW_ARG("Invalid argument in compress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
 
   return TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
                                stride * sizeof(jint), height, pf, dst,
@@ -323,66 +331,76 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFrom
   int *srcOffsets = NULL, *srcStrides = NULL;
   int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-  gethandle();
+  GET_HANDLE();
 
   if (subsamp < 0 || subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-    _throwarg("Invalid argument in compressFromYUV()");
+    THROW_ARG("Invalid argument in compressFromYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, srcobjs) < nc)
-    _throwarg("Planes array is too small for the subsampling type");
+    THROW_ARG("Planes array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
-    _throwarg("Offsets array is too small for the subsampling type");
+    THROW_ARG("Offsets array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
-    _throwarg("Strides array is too small for the subsampling type");
+    THROW_ARG("Strides array is too small for the subsampling type");
 
   jpegSize = tjBufSize(width, height, subsamp);
   if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
-  bailif0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+  if (ProcessSystemProperties(env) < 0) goto bailout;
+
+#define RELEASE_ARRAYS_COMPRESSFROMYUV() { \
+  SAFE_RELEASE(dst, jpegBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanes[i]); \
+  SAFE_RELEASE(jSrcStrides, srcStrides); \
+  SAFE_RELEASE(jSrcOffsets, srcOffsets); \
+}
+
+  BAILIF0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  BAILIF0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
     int pw = tjPlaneWidth(i, width, subsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (srcOffsets[i] < 0)
-      _throwarg("Invalid argument in compressFromYUV()");
-    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (srcOffsets[i] < 0) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG("Invalid argument in compressFromYUV()");
+    }
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
-    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) < srcOffsets[i] + planeSize)
-      _throwarg("Source plane is not large enough");
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG("Source plane is not large enough");
+    }
 
-    bailif0(srcPlanes[i] =
+    BAILIF0(srcPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
     srcPlanes[i] = &srcPlanes[i][srcOffsets[i]];
   }
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
-
-  if (ProcessSystemProperties(env) < 0) goto bailout;
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
                               subsamp, &jpegBuf, &jpegSize, jpegQual,
-                              flags | TJFLAG_NOREALLOC) == -1)
-    _throwtj();
+                              flags | TJFLAG_NOREALLOC) == -1) {
+    RELEASE_ARRAYS_COMPRESSFROMYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (srcPlanes[i] && jSrcPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
-                                            (unsigned char *)srcPlanes[i], 0);
-  }
-  if (srcStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
-  if (srcOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+  RELEASE_ARRAYS_COMPRESSFROMYUV();
   return (jint)jpegSize;
 }
 
@@ -398,68 +416,78 @@ static void TJCompressor_encodeYUV
   int *dstOffsets = NULL, *dstStrides = NULL;
   int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
       height < 1 || pitch < 0 || subsamp < 0 ||
       subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
       org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, dstobjs) < nc)
-    _throwarg("Planes array is too small for the subsampling type");
+    THROW_ARG("Planes array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jDstOffsets) < nc)
-    _throwarg("Offsets array is too small for the subsampling type");
+    THROW_ARG("Offsets array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jDstStrides) < nc)
-    _throwarg("Strides array is too small for the subsampling type");
+    THROW_ARG("Strides array is too small for the subsampling type");
 
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
 
-  bailif0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
-  bailif0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+#define RELEASE_ARRAYS_ENCODEYUV() { \
+  SAFE_RELEASE(src, srcBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jDstPlanes[i], dstPlanes[i]); \
+  SAFE_RELEASE(jDstStrides, dstStrides); \
+  SAFE_RELEASE(jDstOffsets, dstOffsets); \
+}
+
+  BAILIF0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  BAILIF0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, width, dstStrides[i], height, subsamp);
     int pw = tjPlaneWidth(i, width, subsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (dstOffsets[i] < 0)
-      _throwarg("Invalid argument in encodeYUV()");
-    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (dstOffsets[i] < 0) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG("Invalid argument in encodeYUV()");
+    }
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
-    if ((*env)->GetArrayLength(env, jDstPlanes[i]) < dstOffsets[i] + planeSize)
-      _throwarg("Destination plane is not large enough");
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG("Destination plane is not large enough");
+    }
 
-    bailif0(dstPlanes[i] =
+    BAILIF0(dstPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
     dstPlanes[i] = &dstPlanes[i][dstOffsets[i]];
   }
-  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
   if (tjEncodeYUVPlanes(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
                         width, pitch, height, pf, dstPlanes, dstStrides,
-                        subsamp, flags) == -1)
-    _throwtj();
+                        subsamp, flags) == -1) {
+    RELEASE_ARRAYS_ENCODEYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (dstPlanes[i] && jDstPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
-                                            0);
-  }
-  if (dstStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
-  if (dstOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
+  RELEASE_ARRAYS_ENCODEYUV();
 }
 
 /* TurboJPEG 1.4.x: TJCompressor::encodeYUV() byte source */
@@ -479,9 +507,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___
    jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
 
   TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
                          stride * sizeof(jint), height, pf, dstobjs,
@@ -491,7 +519,7 @@ bailout:
   return;
 }
 
-JNIEXPORT void JNICALL TJCompressor_encodeYUV_12
+static void JNICALL TJCompressor_encodeYUV_12
   (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint width,
    jint pitch, jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
@@ -499,31 +527,34 @@ JNIEXPORT void JNICALL TJCompressor_encodeYUV_12
   jsize arraySize = 0;
   unsigned char *srcBuf = NULL, *dstBuf = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
       height < 1 || pitch < 0)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   arraySize = (pitch == 0) ? width * tjPixelSize[pf] * height : pitch * height;
   if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
   if ((*env)->GetArrayLength(env, dst) <
       (jsize)tjBufSizeYUV(width, height, subsamp))
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
-                   flags) == -1)
-    _throwtj();
+                   flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    SAFE_RELEASE(src, srcBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, srcBuf);
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::encodeYUV() byte source */
@@ -541,9 +572,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___
    jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
 
   TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
                             stride * sizeof(jint), height, pf, dst, subsamp,
@@ -559,9 +590,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
 {
   tjhandle handle = 0;
 
-  gethandle();
+  GET_HANDLE();
 
-  if (tjDestroy(handle) == -1) _throwtj();
+  if (tjDestroy(handle) == -1) THROW_TJ();
   (*env)->SetLongField(env, obj, _fid, 0);
 
 bailout:
@@ -577,10 +608,10 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
   tjhandle handle;
 
   if ((handle = tjInitDecompress()) == NULL)
-    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-  bailif0(cls = (*env)->GetObjectClass(env, obj));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
   (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 bailout:
@@ -599,17 +630,17 @@ JNIEXPORT jobjectArray JNICALL Java_org_libjpegturbo_turbojpeg_TJ_getScalingFact
   jobjectArray sfjava = NULL;
 
   if ((sf = tjGetScalingFactors(&n)) == NULL || n == 0)
-    _throwarg(tjGetErrorStr());
+    THROW_ARG(tjGetErrorStr());
 
-  bailif0(sfcls = (*env)->FindClass(env,
+  BAILIF0(sfcls = (*env)->FindClass(env,
     "org/libjpegturbo/turbojpeg/TJScalingFactor"));
-  bailif0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
+  BAILIF0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
 
   for (i = 0; i < n; i++) {
-    bailif0(sfobj = (*env)->AllocObject(env, sfcls));
-    bailif0(fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+    BAILIF0(sfobj = (*env)->AllocObject(env, sfcls));
+    BAILIF0(fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
     (*env)->SetIntField(env, sfobj, fid, sf[i].num);
-    bailif0(fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+    BAILIF0(fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
     (*env)->SetIntField(env, sfobj, fid, sf[i].denom);
     (*env)->SetObjectArrayElement(env, sfjava, i, sfobj);
   }
@@ -626,33 +657,34 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   unsigned char *jpegBuf = NULL;
   int width = 0, height = 0, jpegSubsamp = -1, jpegColorspace = -1;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
 
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
   if (tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize, &width,
-                          &height, &jpegSubsamp, &jpegColorspace) == -1)
-    _throwtj();
+                          &height, &jpegSubsamp, &jpegColorspace) == -1) {
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
 
-  (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-  jpegBuf = NULL;
+  SAFE_RELEASE(src, jpegBuf);
 
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   (*env)->SetIntField(env, obj, _fid, jpegSubsamp);
   if ((_fid = (*env)->GetFieldID(env, _cls, "jpegColorspace", "I")) == 0)
     (*env)->ExceptionClear(env);
   else
     (*env)->SetIntField(env, obj, _fid, jpegColorspace);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   (*env)->SetIntField(env, obj, _fid, width);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   (*env)->SetIntField(env, obj, _fid, height);
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  SAFE_RELEASE(src, jpegBuf);
 }
 
 static void TJDecompressor_decompress
@@ -664,31 +696,34 @@ static void TJDecompressor_decompress
   jsize arraySize = 0, actualPitch;
   unsigned char *jpegBuf = NULL, *dstBuf = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decompress()");
+    THROW_ARG("Invalid argument in decompress()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
                     &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
-                    pitch, height, pf, flags) == -1)
-    _throwtj();
+                    pitch, height, pf, flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, jpegBuf);
 }
 
 /* TurboJPEG 1.3.x: TJDecompressor::decompress() byte destination */
@@ -715,9 +750,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
    jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decompress()");
+    THROW_ARG("Invalid argument in decompress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
   TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
                             width, stride * sizeof(jint), height, pf, flags);
@@ -732,9 +767,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
    jint width, jint stride, jint height, jint pf, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decompress()");
+    THROW_ARG("Invalid argument in decompress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
   TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
                             width, stride * sizeof(jint), height, pf, flags);
@@ -757,15 +792,15 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   int nc = 0, i, width, height, scaledWidth, scaledHeight, nsf = 0;
   tjscalingfactor *sf;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
 
   nc = (jpegSubsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3);
@@ -776,7 +811,7 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   if (height == 0) height = jpegHeight;
   sf = tjGetScalingFactors(&nsf);
   if (!sf || nsf < 1)
-    _throwarg(tjGetErrorStr());
+    THROW_ARG(tjGetErrorStr());
   for (i = 0; i < nsf; i++) {
     scaledWidth = TJSCALED(jpegWidth, sf[i]);
     scaledHeight = TJSCALED(jpegHeight, sf[i]);
@@ -784,49 +819,59 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
       break;
   }
   if (i >= nsf)
-    _throwarg("Could not scale down to desired image dimensions");
+    THROW_ARG("Could not scale down to desired image dimensions");
 
-  bailif0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
-  bailif0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+#define RELEASE_ARRAYS_DECOMPRESSTOYUV() { \
+  SAFE_RELEASE(src, jpegBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jDstPlanes[i], dstPlanes[i]); \
+  SAFE_RELEASE(jDstStrides, dstStrides); \
+  SAFE_RELEASE(jDstOffsets, dstOffsets); \
+}
+
+  BAILIF0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  BAILIF0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, scaledWidth, dstStrides[i], scaledHeight,
                                    jpegSubsamp);
     int pw = tjPlaneWidth(i, scaledWidth, jpegSubsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (dstOffsets[i] < 0)
-      _throwarg("Invalid argument in decompressToYUV()");
-    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (dstOffsets[i] < 0) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG("Invalid argument in decompressToYUV()");
+    }
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
-    if ((*env)->GetArrayLength(env, jDstPlanes[i]) < dstOffsets[i] + planeSize)
-      _throwarg("Destination plane is not large enough");
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG("Destination plane is not large enough");
+    }
 
-    bailif0(dstPlanes[i] =
+    BAILIF0(dstPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
     dstPlanes[i] = &dstPlanes[i][dstOffsets[i]];
   }
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
   if (tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
                               dstPlanes, desiredWidth, dstStrides,
-                              desiredHeight, flags) == -1)
-    _throwtj();
+                              desiredHeight, flags) == -1) {
+    RELEASE_ARRAYS_DECOMPRESSTOYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (dstPlanes[i] && jDstPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
-                                            0);
-  }
-  if (dstStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
-  if (dstOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
+  RELEASE_ARRAYS_DECOMPRESSTOYUV();
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::decompressToYUV() */
@@ -838,30 +883,33 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   unsigned char *jpegBuf = NULL, *dstBuf = NULL;
   int jpegSubsamp = -1, jpegWidth = 0, jpegHeight = 0;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
   if ((*env)->GetArrayLength(env, dst) <
       (jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
-                        flags) == -1)
-    _throwtj();
+                        flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, jpegBuf);
 }
 
 static void TJDecompressor_decodeYUV
@@ -877,67 +925,77 @@ static void TJDecompressor_decodeYUV
   int *srcOffsets = NULL, *srcStrides = NULL;
   int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp < 0 ||
       subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-    _throwarg("Invalid argument in decodeYUV()");
+    THROW_ARG("Invalid argument in decodeYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
       org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, srcobjs) < nc)
-    _throwarg("Planes array is too small for the subsampling type");
+    THROW_ARG("Planes array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
-    _throwarg("Offsets array is too small for the subsampling type");
+    THROW_ARG("Offsets array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
-    _throwarg("Strides array is too small for the subsampling type");
+    THROW_ARG("Strides array is too small for the subsampling type");
 
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
-  bailif0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+#define RELEASE_ARRAYS_DECODEYUV() { \
+  SAFE_RELEASE(dst, dstBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanes[i]); \
+  SAFE_RELEASE(jSrcStrides, srcStrides); \
+  SAFE_RELEASE(jSrcOffsets, srcOffsets); \
+}
+
+  BAILIF0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  BAILIF0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
     int pw = tjPlaneWidth(i, width, subsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (srcOffsets[i] < 0)
-      _throwarg("Invalid argument in decodeYUV()");
-    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (srcOffsets[i] < 0) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG("Invalid argument in decodeYUV()");
+    }
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
-    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) < srcOffsets[i] + planeSize)
-      _throwarg("Source plane is not large enough");
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG("Source plane is not large enough");
+    }
 
-    bailif0(srcPlanes[i] =
+    BAILIF0(srcPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
     srcPlanes[i] = &srcPlanes[i][srcOffsets[i]];
   }
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
                         &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
-                        pitch, height, pf, flags) == -1)
-    _throwtj();
+                        pitch, height, pf, flags) == -1) {
+    RELEASE_ARRAYS_DECODEYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (srcPlanes[i] && jSrcPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
-                                            (unsigned char *)srcPlanes[i], 0);
-  }
-  if (srcStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
-  if (srcOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+  RELEASE_ARRAYS_DECODEYUV();
 }
 
 /* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() byte destination */
@@ -958,9 +1016,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV_
    jint width, jint stride, jint height, jint pf, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decodeYUV()");
+    THROW_ARG("Invalid argument in decodeYUV()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when decoding to an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when decoding to an integer buffer.");
 
   TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
                            subsamp, dst, sizeof(jint), x, y, width,
@@ -979,10 +1037,10 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
   tjhandle handle;
 
   if ((handle = tjInitTransform()) == NULL)
-    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-  bailif0(cls = (*env)->GetObjectClass(env, obj));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
   (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 bailout:
@@ -1007,43 +1065,43 @@ static int JNICustomFilter(short *coeffs, tjregion arrayRegion,
   jmethodID mid;
   jfieldID fid;
 
-  bailif0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
+  BAILIF0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
     sizeof(short) * arrayRegion.w * arrayRegion.h));
-  bailif0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
-  bailif0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
+  BAILIF0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
                                           "()Ljava/nio/ByteOrder;"));
-  bailif0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
-  bailif0(cls = (*env)->GetObjectClass(env, bufobj));
-  bailif0(mid = (*env)->GetMethodID(env, cls, "order",
+  BAILIF0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
+  BAILIF0(cls = (*env)->GetObjectClass(env, bufobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "order",
     "(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
   (*env)->CallObjectMethod(env, bufobj, mid, borobj);
-  bailif0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
                                     "()Ljava/nio/ShortBuffer;"));
-  bailif0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));
+  BAILIF0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));
 
-  bailif0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
-  bailif0(arrayRegionObj = (*env)->AllocObject(env, cls));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  BAILIF0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  BAILIF0(arrayRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.x);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.y);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.w);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.h);
 
-  bailif0(planeRegionObj = (*env)->AllocObject(env, cls));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  BAILIF0(planeRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.x);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.y);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.w);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.h);
 
-  bailif0(cls = (*env)->GetObjectClass(env, cfobj));
-  bailif0(mid = (*env)->GetMethodID(env, cls, "customFilter",
+  BAILIF0(cls = (*env)->GetObjectClass(env, cfobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "customFilter",
     "(Ljava/nio/ShortBuffer;Ljava/awt/Rectangle;Ljava/awt/Rectangle;IILorg/libjpegturbo/turbojpeg/TJTransform;)V"));
   (*env)->CallVoidMethod(env, cfobj, mid, bufobj, arrayRegionObj,
                          planeRegionObj, componentIndex, transformIndex, tobj);
@@ -1070,33 +1128,33 @@ JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transf
   jint *dstSizesi = NULL;
   JNICustomFilterParams *params = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, jsrcBuf) < jpegSize)
-    _throwarg("Source buffer is not large enough");
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
 
   n = (*env)->GetArrayLength(env, dstobjs);
   if (n != (*env)->GetArrayLength(env, tobjs))
-    _throwarg("Mismatch between size of transforms array and destination buffers array");
+    THROW_ARG("Mismatch between size of transforms array and destination buffers array");
 
   if ((dstBufs =
        (unsigned char **)malloc(sizeof(unsigned char *) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((jdstBufs = (jbyteArray *)malloc(sizeof(jbyteArray) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((dstSizes = (unsigned long *)malloc(sizeof(unsigned long) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((t = (tjtransform *)malloc(sizeof(tjtransform) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((params = (JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams) *
                                                 n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   for (i = 0; i < n; i++) {
     dstBufs[i] = NULL;  jdstBufs[i] = NULL;  dstSizes[i] = 0;
     memset(&t[i], 0, sizeof(tjtransform));
@@ -1106,22 +1164,22 @@ JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transf
   for (i = 0; i < n; i++) {
     jobject tobj, cfobj;
 
-    bailif0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
-    bailif0(_cls = (*env)->GetObjectClass(env, tobj));
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
+    BAILIF0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
+    BAILIF0(_cls = (*env)->GetObjectClass(env, tobj));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
     t[i].op = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
     t[i].options = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
     t[i].r.x = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
     t[i].r.y = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
     t[i].r.w = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
     t[i].r.h = (*env)->GetIntField(env, tobj, _fid);
 
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "cf",
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "cf",
       "Lorg/libjpegturbo/turbojpeg/TJCustomFilter;"));
     cfobj = (*env)->GetObjectField(env, tobj, _fid);
     if (cfobj) {
@@ -1138,29 +1196,30 @@ JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transf
 
     if (t[i].r.w != 0) w = t[i].r.w;
     if (t[i].r.h != 0) h = t[i].r.h;
-    bailif0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    BAILIF0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
     if ((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i]) <
         tjBufSize(w, h, jpegSubsamp))
-      _throwarg("Destination buffer is not large enough");
+      THROW_ARG("Destination buffer is not large enough");
   }
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
   for (i = 0; i < n; i++)
-    bailif0(dstBufs[i] =
+    BAILIF0(dstBufs[i] =
             (*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
 
   if (tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
-                  flags | TJFLAG_NOREALLOC) == -1)
-    _throwtj();
-
-  for (i = 0; i < n; i++) {
-    (*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
-    dstBufs[i] = NULL;
+                  flags | TJFLAG_NOREALLOC) == -1) {
+    for (i = 0; i < n; i++)
+      SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+    SAFE_RELEASE(jsrcBuf, jpegBuf);
+    THROW_TJ();
   }
-  (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-  jpegBuf = NULL;
+
+  for (i = 0; i < n; i++)
+    SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
 
   jdstSizes = (*env)->NewIntArray(env, n);
-  bailif0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
+  BAILIF0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
   for (i = 0; i < n; i++) dstSizesi[i] = (int)dstSizes[i];
 
 bailout:
@@ -1172,10 +1231,10 @@ bailout:
     }
     free(dstBufs);
   }
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-  if (jdstBufs) free(jdstBufs);
-  if (dstSizes) free(dstSizes);
-  if (t) free(t);
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
+  free(jdstBufs);
+  free(dstSizes);
+  free(t);
   return jdstSizes;
 }
 
diff --git a/turbojpeg.c b/turbojpeg.c
index ce3287b8..f8fd36bd 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,7 +1,5 @@
 /*
- * Copyright (C)2009-2018 D. R. Commander.  All Rights Reserved.
- * mozjpeg Modifications:
- * Copyright (C) 2014, Mozilla Corporation.
+ * Copyright (C)2009-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -49,20 +47,20 @@
 extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **, unsigned long *,
                              boolean);
 extern void jpeg_mem_src_tj(j_decompress_ptr, const unsigned char *,
-	unsigned long);
+                            unsigned long);
 
-#define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
-#define isPow2(x) (((x)&(x-1))==0)
+#define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
+#define IS_POW2(x)  (((x) & (x - 1)) == 0)
 
 
 /* Error handling (based on example in example.txt) */
 
-static char errStr[JMSG_LENGTH_MAX]="No error";
+static char errStr[JMSG_LENGTH_MAX] = "No error";
 
 struct my_error_mgr {
-	struct jpeg_error_mgr pub;
-	jmp_buf setjmp_buffer;
-	void (*emit_message)(j_common_ptr, int);
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+  void (*emit_message) (j_common_ptr, int);
   boolean warning, stopOnWarning;
 };
 typedef struct my_error_mgr *my_error_ptr;
@@ -75,24 +73,24 @@ static const char *turbojpeg_message_table[] = {
 
 static void my_error_exit(j_common_ptr cinfo)
 {
-	my_error_ptr myerr=(my_error_ptr)cinfo->err;
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
 
-	(*cinfo->err->output_message)(cinfo);
-	longjmp(myerr->setjmp_buffer, 1);
+  (*cinfo->err->output_message) (cinfo);
+  longjmp(myerr->setjmp_buffer, 1);
 }
 
 /* Based on output_message() in jerror.c */
 
 static void my_output_message(j_common_ptr cinfo)
 {
-	(*cinfo->err->format_message)(cinfo, errStr);
+  (*cinfo->err->format_message) (cinfo, errStr);
 }
 
 static void my_emit_message(j_common_ptr cinfo, int msg_level)
 {
-	my_error_ptr myerr=(my_error_ptr)cinfo->err;
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
 
-	myerr->emit_message(cinfo, msg_level);
+  myerr->emit_message(cinfo, msg_level);
   if (msg_level < 0) {
     myerr->warning = TRUE;
     if (myerr->stopOnWarning) longjmp(myerr->setjmp_buffer, 1);
@@ -102,42 +100,42 @@ static void my_emit_message(j_common_ptr cinfo, int msg_level)
 
 /* Global structures, macros, etc. */
 
-enum {COMPRESS=1, DECOMPRESS=2};
+enum { COMPRESS = 1, DECOMPRESS = 2 };
 
 typedef struct _tjinstance {
-	struct jpeg_compress_struct cinfo;
-	struct jpeg_decompress_struct dinfo;
-	struct my_error_mgr jerr;
-	int init, headerRead;
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_decompress_struct dinfo;
+  struct my_error_mgr jerr;
+  int init, headerRead;
   char errStr[JMSG_LENGTH_MAX];
   boolean isInstanceError;
 } tjinstance;
 
-static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3, 3};
+static const int pixelsize[TJ_NUMSAMP] = { 3, 3, 3, 1, 3, 3 };
 
 static const JXFORM_CODE xformtypes[TJ_NUMXOP] = {
-	JXFORM_NONE, JXFORM_FLIP_H, JXFORM_FLIP_V, JXFORM_TRANSPOSE,
-	JXFORM_TRANSVERSE, JXFORM_ROT_90, JXFORM_ROT_180, JXFORM_ROT_270
+  JXFORM_NONE, JXFORM_FLIP_H, JXFORM_FLIP_V, JXFORM_TRANSPOSE,
+  JXFORM_TRANSVERSE, JXFORM_ROT_90, JXFORM_ROT_180, JXFORM_ROT_270
 };
 
-#define NUMSF 16
-static const tjscalingfactor sf[NUMSF]={
-	{2, 1},
-	{15, 8},
-	{7, 4},
-	{13, 8},
-	{3, 2},
-	{11, 8},
-	{5, 4},
-	{9, 8},
-	{1, 1},
-	{7, 8},
-	{3, 4},
-	{5, 8},
-	{1, 2},
-	{3, 8},
-	{1, 4},
-	{1, 8}
+#define NUMSF  16
+static const tjscalingfactor sf[NUMSF] = {
+  { 2, 1 },
+  { 15, 8 },
+  { 7, 4 },
+  { 13, 8 },
+  { 3, 2 },
+  { 11, 8 },
+  { 5, 4 },
+  { 9, 8 },
+  { 1, 1 },
+  { 7, 8 },
+  { 3, 4 },
+  { 5, 8 },
+  { 1, 2 },
+  { 3, 8 },
+  { 1, 4 },
+  { 1, 8 }
 };
 
 static J_COLOR_SPACE pf2cs[TJ_NUMPF] = {
@@ -166,20 +164,20 @@ static int cs2pf[JPEG_NUMCS] = {
   TJPF_UNKNOWN
 };
 
-#define _throwg(m) { \
+#define THROWG(m) { \
   snprintf(errStr, JMSG_LENGTH_MAX, "%s", m); \
   retval = -1;  goto bailout; \
 }
-#define _throwunix(m) { \
+#define THROW_UNIX(m) { \
   snprintf(errStr, JMSG_LENGTH_MAX, "%s\n%s", m, strerror(errno)); \
   retval = -1;  goto bailout; \
 }
-#define _throw(m) { \
+#define THROW(m) { \
   snprintf(this->errStr, JMSG_LENGTH_MAX, "%s", m); \
-  this->isInstanceError = TRUE;  _throwg(m) \
+  this->isInstanceError = TRUE;  THROWG(m) \
 }
 
-#define getinstance(handle) \
+#define GET_INSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
   j_compress_ptr cinfo = NULL; \
   j_decompress_ptr dinfo = NULL; \
@@ -188,72 +186,72 @@ static int cs2pf[JPEG_NUMCS] = {
     snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle"); \
     return -1; \
   } \
-	cinfo=&this->cinfo;  dinfo=&this->dinfo;  \
+  cinfo = &this->cinfo;  dinfo = &this->dinfo; \
   this->jerr.warning = FALSE; \
   this->isInstanceError = FALSE;
 
-#define getcinstance(handle) \
+#define GET_CINSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
-	j_compress_ptr cinfo=NULL;  \
+  j_compress_ptr cinfo = NULL; \
   \
   if (!this) { \
     snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle"); \
     return -1; \
   } \
-	cinfo=&this->cinfo;  \
+  cinfo = &this->cinfo; \
   this->jerr.warning = FALSE; \
   this->isInstanceError = FALSE;
 
-#define getdinstance(handle) \
+#define GET_DINSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
-	j_decompress_ptr dinfo=NULL;  \
+  j_decompress_ptr dinfo = NULL; \
   \
   if (!this) { \
     snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle"); \
     return -1; \
   } \
-	dinfo=&this->dinfo;  \
+  dinfo = &this->dinfo; \
   this->jerr.warning = FALSE; \
   this->isInstanceError = FALSE;
 
 static int getPixelFormat(int pixelSize, int flags)
 {
-	if(pixelSize==1) return TJPF_GRAY;
+  if (pixelSize == 1) return TJPF_GRAY;
   if (pixelSize == 3) {
-		if(flags&TJ_BGR) return TJPF_BGR;
-		else return TJPF_RGB;
-	}
+    if (flags & TJ_BGR) return TJPF_BGR;
+    else return TJPF_RGB;
+  }
   if (pixelSize == 4) {
     if (flags & TJ_ALPHAFIRST) {
-			if(flags&TJ_BGR) return TJPF_XBGR;
-			else return TJPF_XRGB;
+      if (flags & TJ_BGR) return TJPF_XBGR;
+      else return TJPF_XRGB;
     } else {
-			if(flags&TJ_BGR) return TJPF_BGRX;
-			else return TJPF_RGBX;
-		}
-	}
-	return -1;
+      if (flags & TJ_BGR) return TJPF_BGRX;
+      else return TJPF_RGBX;
+    }
+  }
+  return -1;
 }
 
 static int setCompDefaults(struct jpeg_compress_struct *cinfo, int pixelFormat,
                            int subsamp, int jpegQual, int flags)
 {
-	int retval=0;
-	char *env=NULL;
+  int retval = 0;
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
 
   cinfo->in_color_space = pf2cs[pixelFormat];
-	cinfo->input_components=tjPixelSize[pixelFormat];
-	if((env=getenv("TJ_REVERT"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
-		cinfo->master->compress_profile=JCP_FASTEST;
-	jpeg_set_defaults(cinfo);
+  cinfo->input_components = tjPixelSize[pixelFormat];
+  jpeg_set_defaults(cinfo);
 
 #ifndef NO_GETENV
   if ((env = getenv("TJ_OPTIMIZE")) != NULL && strlen(env) > 0 &&
       !strcmp(env, "1"))
-		cinfo->optimize_coding=TRUE;
+    cinfo->optimize_coding = TRUE;
   if ((env = getenv("TJ_ARITHMETIC")) != NULL && strlen(env) > 0 &&
       !strcmp(env, "1"))
-		cinfo->arith_code=TRUE;
+    cinfo->arith_code = TRUE;
   if ((env = getenv("TJ_RESTART")) != NULL && strlen(env) > 0) {
     int temp = -1;
     char tempc = 0;
@@ -261,25 +259,25 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo, int pixelFormat,
     if (sscanf(env, "%d%c", &temp, &tempc) >= 1 && temp >= 0 &&
         temp <= 65535) {
       if (toupper(tempc) == 'B') {
-				cinfo->restart_interval=temp;
-				cinfo->restart_in_rows=0;
+        cinfo->restart_interval = temp;
+        cinfo->restart_in_rows = 0;
       } else
-				cinfo->restart_in_rows=temp;
-		}
-	}
+        cinfo->restart_in_rows = temp;
+    }
+  }
 #endif
 
   if (jpegQual >= 0) {
-		jpeg_set_quality(cinfo, jpegQual, TRUE);
+    jpeg_set_quality(cinfo, jpegQual, TRUE);
     if (jpegQual >= 96 || flags & TJFLAG_ACCURATEDCT)
       cinfo->dct_method = JDCT_ISLOW;
     else
       cinfo->dct_method = JDCT_FASTEST;
-	}
-	if(subsamp==TJSAMP_GRAY)
-		jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-	else if(pixelFormat==TJPF_CMYK)
-		jpeg_set_colorspace(cinfo, JCS_YCCK);
+  }
+  if (subsamp == TJSAMP_GRAY)
+    jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
+  else if (pixelFormat == TJPF_CMYK)
+    jpeg_set_colorspace(cinfo, JCS_YCCK);
   else
     jpeg_set_colorspace(cinfo, JCS_YCbCr);
 
@@ -288,38 +286,34 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo, int pixelFormat,
 #ifndef NO_GETENV
   else if ((env = getenv("TJ_PROGRESSIVE")) != NULL && strlen(env) > 0 &&
            !strcmp(env, "1"))
-		jpeg_simple_progression(cinfo);
+    jpeg_simple_progression(cinfo);
 #endif
 
-	/* Set scan pattern again as colorspace might have changed */
-	if(cinfo->master->compress_profile == JCP_MAX_COMPRESSION)
-		jpeg_simple_progression(cinfo);
+  cinfo->comp_info[0].h_samp_factor = tjMCUWidth[subsamp] / 8;
+  cinfo->comp_info[1].h_samp_factor = 1;
+  cinfo->comp_info[2].h_samp_factor = 1;
+  if (cinfo->num_components > 3)
+    cinfo->comp_info[3].h_samp_factor = tjMCUWidth[subsamp] / 8;
+  cinfo->comp_info[0].v_samp_factor = tjMCUHeight[subsamp] / 8;
+  cinfo->comp_info[1].v_samp_factor = 1;
+  cinfo->comp_info[2].v_samp_factor = 1;
+  if (cinfo->num_components > 3)
+    cinfo->comp_info[3].v_samp_factor = tjMCUHeight[subsamp] / 8;
 
-	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
-	cinfo->comp_info[1].h_samp_factor=1;
-	cinfo->comp_info[2].h_samp_factor=1;
-	if(cinfo->num_components>3)
-		cinfo->comp_info[3].h_samp_factor=tjMCUWidth[subsamp]/8;
-	cinfo->comp_info[0].v_samp_factor=tjMCUHeight[subsamp]/8;
-	cinfo->comp_info[1].v_samp_factor=1;
-	cinfo->comp_info[2].v_samp_factor=1;
-	if(cinfo->num_components>3)
-		cinfo->comp_info[3].v_samp_factor=tjMCUHeight[subsamp]/8;
-
-	return retval;
+  return retval;
 }
 
 
 static int getSubsamp(j_decompress_ptr dinfo)
 {
-	int retval=-1, i, k;
+  int retval = -1, i, k;
 
-	/* The sampling factors actually have no meaning with grayscale JPEG files,
-	   and in fact it's possible to generate grayscale JPEGs with sampling
-	   factors > 1 (even though those sampling factors are ignored by the
-	   decompressor.)  Thus, we need to treat grayscale as a special case. */
-	if(dinfo->num_components==1 && dinfo->jpeg_color_space==JCS_GRAYSCALE)
-		return TJSAMP_GRAY;
+  /* The sampling factors actually have no meaning with grayscale JPEG files,
+     and in fact it's possible to generate grayscale JPEGs with sampling
+     factors > 1 (even though those sampling factors are ignored by the
+     decompressor.)  Thus, we need to treat grayscale as a special case. */
+  if (dinfo->num_components == 1 && dinfo->jpeg_color_space == JCS_GRAYSCALE)
+    return TJSAMP_GRAY;
 
   for (i = 0; i < NUMSUBOPT; i++) {
     if (dinfo->num_components == pixelsize[i] ||
@@ -328,48 +322,65 @@ static int getSubsamp(j_decompress_ptr dinfo)
          pixelsize[i] == 3 && dinfo->num_components == 4)) {
       if (dinfo->comp_info[0].h_samp_factor == tjMCUWidth[i] / 8 &&
           dinfo->comp_info[0].v_samp_factor == tjMCUHeight[i] / 8) {
-				int match=0;
+        int match = 0;
 
         for (k = 1; k < dinfo->num_components; k++) {
-					int href=1, vref=1;
+          int href = 1, vref = 1;
 
           if ((dinfo->jpeg_color_space == JCS_YCCK ||
                dinfo->jpeg_color_space == JCS_CMYK) && k == 3) {
-						href=tjMCUWidth[i]/8;  vref=tjMCUHeight[i]/8;
-					}
+            href = tjMCUWidth[i] / 8;  vref = tjMCUHeight[i] / 8;
+          }
           if (dinfo->comp_info[k].h_samp_factor == href &&
               dinfo->comp_info[k].v_samp_factor == vref)
-						match++;
-				}
+            match++;
+        }
         if (match == dinfo->num_components - 1) {
-					retval=i;  break;
-				}
-			}
-			/* Handle 4:2:2 and 4:4:0 images whose sampling factors are specified
-			   in non-standard ways. */
-			if(dinfo->comp_info[0].h_samp_factor==2 &&
-				dinfo->comp_info[0].v_samp_factor==2 &&
+          retval = i;  break;
+        }
+      }
+      /* Handle 4:2:2 and 4:4:0 images whose sampling factors are specified
+         in non-standard ways. */
+      if (dinfo->comp_info[0].h_samp_factor == 2 &&
+          dinfo->comp_info[0].v_samp_factor == 2 &&
           (i == TJSAMP_422 || i == TJSAMP_440)) {
-				int match=0;
+        int match = 0;
 
         for (k = 1; k < dinfo->num_components; k++) {
-					int href=tjMCUHeight[i]/8, vref=tjMCUWidth[i]/8;
+          int href = tjMCUHeight[i] / 8, vref = tjMCUWidth[i] / 8;
 
           if ((dinfo->jpeg_color_space == JCS_YCCK ||
                dinfo->jpeg_color_space == JCS_CMYK) && k == 3) {
-						href=vref=2;
-					}
+            href = vref = 2;
+          }
           if (dinfo->comp_info[k].h_samp_factor == href &&
               dinfo->comp_info[k].v_samp_factor == vref)
-						match++;
-				}
+            match++;
+        }
         if (match == dinfo->num_components - 1) {
-					retval=i;  break;
-				}
-			}
-		}
-	}
-	return retval;
+          retval = i;  break;
+        }
+      }
+      /* Handle 4:4:4 images whose sampling factors are specified in
+         non-standard ways (every component matches component 0's factors). */
+      if (dinfo->comp_info[0].h_samp_factor *
+          dinfo->comp_info[0].v_samp_factor <=
+          D_MAX_BLOCKS_IN_MCU / pixelsize[i] && i == TJSAMP_444) {
+        int match = 0;
+        for (k = 1; k < dinfo->num_components; k++) {
+          if (dinfo->comp_info[k].h_samp_factor ==
+              dinfo->comp_info[0].h_samp_factor &&
+              dinfo->comp_info[k].v_samp_factor ==
+              dinfo->comp_info[0].v_samp_factor)
+            match++;
+        }
+        if (match == dinfo->num_components - 1) {
+          retval = i;  break;  /* leaves the outer subsampling search */
+        }
+      }
+    }
+  }
+  return retval;
 }
 
 
@@ -404,13 +415,13 @@ DLLEXPORT int tjGetErrorCode(tjhandle handle)
 
 DLLEXPORT int tjDestroy(tjhandle handle)
 {
-	getinstance(handle);
+  GET_INSTANCE(handle);
 
-	if(setjmp(this->jerr.setjmp_buffer)) return -1;
-	if(this->init&COMPRESS) jpeg_destroy_compress(cinfo);
-	if(this->init&DECOMPRESS) jpeg_destroy_decompress(dinfo);
-	free(this);
-	return 0;
+  if (setjmp(this->jerr.setjmp_buffer)) return -1;
+  if (this->init & COMPRESS) jpeg_destroy_compress(cinfo);
+  if (this->init & DECOMPRESS) jpeg_destroy_decompress(dinfo);
+  free(this);
+  return 0;
 }
 
 
@@ -421,13 +432,13 @@ DLLEXPORT int tjDestroy(tjhandle handle)
 
 DLLEXPORT void tjFree(unsigned char *buf)
 {
-	if(buf) free(buf);
+  free(buf);
 }
 
 
 DLLEXPORT unsigned char *tjAlloc(int bytes)
 {
-	return (unsigned char *)malloc(bytes);
+  return (unsigned char *)malloc(bytes);
 }
 
 
@@ -435,181 +446,190 @@ DLLEXPORT unsigned char *tjAlloc(int bytes)
 
 static tjhandle _tjInitCompress(tjinstance *this)
 {
-	static unsigned char buffer[1];
+  static unsigned char buffer[1];
   unsigned char *buf = buffer;
   unsigned long size = 1;
 
   /* This is also straight out of example.txt */
-	this->cinfo.err=jpeg_std_error(&this->jerr.pub);
-	this->jerr.pub.error_exit=my_error_exit;
-	this->jerr.pub.output_message=my_output_message;
-	this->jerr.emit_message=this->jerr.pub.emit_message;
-	this->jerr.pub.emit_message=my_emit_message;
+  this->cinfo.err = jpeg_std_error(&this->jerr.pub);
+  this->jerr.pub.error_exit = my_error_exit;
+  this->jerr.pub.output_message = my_output_message;
+  this->jerr.emit_message = this->jerr.pub.emit_message;
+  this->jerr.pub.emit_message = my_emit_message;
   this->jerr.pub.addon_message_table = turbojpeg_message_table;
   this->jerr.pub.first_addon_message = JMSG_FIRSTADDONCODE;
   this->jerr.pub.last_addon_message = JMSG_LASTADDONCODE;
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		if(this) free(this);
-		return NULL;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    free(this);
+    return NULL;
+  }
 
-	jpeg_create_compress(&this->cinfo);
-	/* Make an initial call so it will create the destination manager */
-	jpeg_mem_dest_tj(&this->cinfo, &buf, &size, 0);
+  jpeg_create_compress(&this->cinfo);
+  /* Make an initial call so it will create the destination manager */
+  jpeg_mem_dest_tj(&this->cinfo, &buf, &size, 0);
 
-	this->init|=COMPRESS;
-	return (tjhandle)this;
+  this->init |= COMPRESS;
+  return (tjhandle)this;
 }
 
 DLLEXPORT tjhandle tjInitCompress(void)
 {
-	tjinstance *this=NULL;
+  tjinstance *this = NULL;
 
   if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL) {
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjInitCompress(): Memory allocation failure");
-		return NULL;
-	}
-	MEMZERO(this, sizeof(tjinstance));
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjInitCompress(): Memory allocation failure");
+    return NULL;
+  }
+  MEMZERO(this, sizeof(tjinstance));
   snprintf(this->errStr, JMSG_LENGTH_MAX, "No error");
-	return _tjInitCompress(this);
+  return _tjInitCompress(this);
 }
 
 
 DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp)
 {
-  unsigned long retval = 0;
+  unsigned long long retval = 0;
   int mcuw, mcuh, chromasf;
 
-	if(width<1 || height<1 || jpegSubsamp<0 || jpegSubsamp>=NUMSUBOPT)
-    _throwg("tjBufSize(): Invalid argument");
+  if (width < 1 || height < 1 || jpegSubsamp < 0 || jpegSubsamp >= NUMSUBOPT)
+    THROWG("tjBufSize(): Invalid argument");
 
-	/* This allows for rare corner cases in which a JPEG image can actually be
-	   larger than the uncompressed input (we wouldn't mention it if it hadn't
-	   happened before.) */
-	mcuw=tjMCUWidth[jpegSubsamp];
-	mcuh=tjMCUHeight[jpegSubsamp];
-	chromasf=jpegSubsamp==TJSAMP_GRAY? 0: 4*64/(mcuw*mcuh);
-	retval=PAD(width, mcuw) * PAD(height, mcuh) * (2 + chromasf) + 2048;
+  /* This allows for rare corner cases in which a JPEG image can actually be
+     larger than the uncompressed input (we wouldn't mention it if it hadn't
+     happened before.) */
+  mcuw = tjMCUWidth[jpegSubsamp];
+  mcuh = tjMCUHeight[jpegSubsamp];
+  chromasf = jpegSubsamp == TJSAMP_GRAY ? 0 : 4 * 64 / (mcuw * mcuh);
+  retval = PAD(width, mcuw) * PAD(height, mcuh) * (2ULL + chromasf) + 2048ULL;
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("tjBufSize(): Image is too large");
 
-	bailout:
-	return retval;
+bailout:
+  return (unsigned long)retval;
 }
 
 DLLEXPORT unsigned long TJBUFSIZE(int width, int height)
 {
-	unsigned long retval=0;
+  unsigned long long retval = 0;
 
-	if(width<1 || height<1)
-    _throwg("TJBUFSIZE(): Invalid argument");
+  if (width < 1 || height < 1)
+    THROWG("TJBUFSIZE(): Invalid argument");
 
-	/* This allows for rare corner cases in which a JPEG image can actually be
-	   larger than the uncompressed input (we wouldn't mention it if it hadn't
-	   happened before.) */
-	retval=PAD(width, 16) * PAD(height, 16) * 6 + 2048;
+  /* This allows for rare corner cases in which a JPEG image can actually be
+     larger than the uncompressed input (we wouldn't mention it if it hadn't
+     happened before.) */
+  retval = PAD(width, 16) * PAD(height, 16) * 6ULL + 2048ULL;
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("TJBUFSIZE(): Image is too large");
 
-	bailout:
-	return retval;
+bailout:
+  return (unsigned long)retval;
 }
 
 
 DLLEXPORT unsigned long tjBufSizeYUV2(int width, int pad, int height,
-	int subsamp)
+                                      int subsamp)
 {
-	int retval=0, nc, i;
+  unsigned long long retval = 0;
+  int nc, i;
 
-	if(subsamp<0 || subsamp>=NUMSUBOPT)
-    _throwg("tjBufSizeYUV2(): Invalid argument");
+  if (subsamp < 0 || subsamp >= NUMSUBOPT)
+    THROWG("tjBufSizeYUV2(): Invalid argument");
 
-	nc=(subsamp==TJSAMP_GRAY? 1:3);
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
   for (i = 0; i < nc; i++) {
-		int pw=tjPlaneWidth(i, width, subsamp);
-		int stride=PAD(pw, pad);
-		int ph=tjPlaneHeight(i, height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
+    int stride = PAD(pw, pad);
+    int ph = tjPlaneHeight(i, height, subsamp);
 
-		if(pw<0 || ph<0) return -1;
-		else retval+=stride*ph;
-	}
+    if (pw < 0 || ph < 0) return -1;
+    else retval += (unsigned long long)stride * ph;
+  }
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("tjBufSizeYUV2(): Image is too large");
 
-	bailout:
-	return retval;
+bailout:
+  return (unsigned long)retval;
 }
 
 DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp)
 {
-	return tjBufSizeYUV2(width, 4, height, subsamp);
+  return tjBufSizeYUV2(width, 4, height, subsamp);
 }
 
 DLLEXPORT unsigned long TJBUFSIZEYUV(int width, int height, int subsamp)
 {
-	return tjBufSizeYUV(width, height, subsamp);
+  return tjBufSizeYUV(width, height, subsamp);
 }
 
 
 DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp)
 {
-	int pw, nc, retval=0;
+  int pw, nc, retval = 0;
 
-	if(width<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
-    _throwg("tjPlaneWidth(): Invalid argument");
-	nc=(subsamp==TJSAMP_GRAY? 1:3);
-	if(componentID<0 || componentID>=nc)
-    _throwg("tjPlaneWidth(): Invalid argument");
+  if (width < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROWG("tjPlaneWidth(): Invalid argument");
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  if (componentID < 0 || componentID >= nc)
+    THROWG("tjPlaneWidth(): Invalid argument");
 
-	pw=PAD(width, tjMCUWidth[subsamp]/8);
-	if(componentID==0)
-		retval=pw;
-	else
-		retval=pw*8/tjMCUWidth[subsamp];
+  pw = PAD(width, tjMCUWidth[subsamp] / 8);
+  if (componentID == 0)
+    retval = pw;
+  else
+    retval = pw * 8 / tjMCUWidth[subsamp];
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 
 DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp)
 {
-	int ph, nc, retval=0;
+  int ph, nc, retval = 0;
 
-	if(height<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
-    _throwg("tjPlaneHeight(): Invalid argument");
-	nc=(subsamp==TJSAMP_GRAY? 1:3);
-	if(componentID<0 || componentID>=nc)
-    _throwg("tjPlaneHeight(): Invalid argument");
+  if (height < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROWG("tjPlaneHeight(): Invalid argument");
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  if (componentID < 0 || componentID >= nc)
+    THROWG("tjPlaneHeight(): Invalid argument");
 
-	ph=PAD(height, tjMCUHeight[subsamp]/8);
-	if(componentID==0)
-		retval=ph;
-	else
-		retval=ph*8/tjMCUHeight[subsamp];
+  ph = PAD(height, tjMCUHeight[subsamp] / 8);
+  if (componentID == 0)
+    retval = ph;
+  else
+    retval = ph * 8 / tjMCUHeight[subsamp];
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 
 DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
                                        int height, int subsamp)
 {
-	unsigned long retval=0;
-	int pw, ph;
+  unsigned long long retval = 0;
+  int pw, ph;
 
-	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
-    _throwg("tjPlaneSizeYUV(): Invalid argument");
+  if (width < 1 || height < 1 || subsamp < 0 || subsamp >= NUMSUBOPT)
+    THROWG("tjPlaneSizeYUV(): Invalid argument");
 
-	pw=tjPlaneWidth(componentID, width, subsamp);
-	ph=tjPlaneHeight(componentID, height, subsamp);
-	if(pw<0 || ph<0) return -1;
+  pw = tjPlaneWidth(componentID, width, subsamp);
+  ph = tjPlaneHeight(componentID, height, subsamp);
+  if (pw < 0 || ph < 0) return -1;
 
-	if(stride==0) stride=pw;
-	else stride=abs(stride);
+  if (stride == 0) stride = pw;
+  else stride = abs(stride);
 
-	retval=stride*(ph-1)+pw;
+  retval = (unsigned long long)stride * (ph - 1) + pw;
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("tjPlaneSizeYUV(): Image is too large");
 
-	bailout:
-	return retval;
+bailout:
+  return (unsigned long)retval;
 }
 
 
@@ -621,61 +641,61 @@ DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
   int i, retval = 0, alloc = 1;
   JSAMPROW *row_pointer = NULL;
 
-	getcinstance(handle)
+  GET_CINSTANCE(handle)
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
-	if((this->init&COMPRESS)==0)
-		_throw("tjCompress2(): Instance has not been initialized for compression");
+  if ((this->init & COMPRESS) == 0)
+    THROW("tjCompress2(): Instance has not been initialized for compression");
 
   if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF || jpegBuf == NULL ||
       jpegSize == NULL || jpegSubsamp < 0 || jpegSubsamp >= NUMSUBOPT ||
       jpegQual < 0 || jpegQual > 100)
-		_throw("tjCompress2(): Invalid argument");
+    THROW("tjCompress2(): Invalid argument");
 
-	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("tjCompress2(): Memory allocation failure");
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * height)) == NULL)
+    THROW("tjCompress2(): Memory allocation failure");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	cinfo->image_width=width;
-	cinfo->image_height=height;
+  cinfo->image_width = width;
+  cinfo->image_height = height;
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
   if (flags & TJFLAG_NOREALLOC) {
-		alloc=0;  *jpegSize=tjBufSize(width, height, jpegSubsamp);
-	}
-	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
-	if(setCompDefaults(cinfo, pixelFormat, jpegSubsamp, jpegQual, flags)==-1)
-		return -1;
+    alloc = 0;  *jpegSize = tjBufSize(width, height, jpegSubsamp);
+  }
+  jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+  if (setCompDefaults(cinfo, pixelFormat, jpegSubsamp, jpegQual, flags) == -1)
+    return -1;
 
-	jpeg_start_compress(cinfo, TRUE);
+  jpeg_start_compress(cinfo, TRUE);
   for (i = 0; i < height; i++) {
-		if(flags&TJFLAG_BOTTOMUP)
-			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = (JSAMPROW)&srcBuf[i * pitch];
-	}
-	while(cinfo->next_scanline<cinfo->image_height)
-		jpeg_write_scanlines(cinfo, &row_pointer[cinfo->next_scanline],
-			cinfo->image_height-cinfo->next_scanline);
-	jpeg_finish_compress(cinfo);
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * (size_t)pitch];
+  }
+  while (cinfo->next_scanline < cinfo->image_height)
+    jpeg_write_scanlines(cinfo, &row_pointer[cinfo->next_scanline],
+                         cinfo->image_height - cinfo->next_scanline);
+  jpeg_finish_compress(cinfo);
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	if(row_pointer) free(row_pointer);
-	if(this->jerr.warning) retval=-1;
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  free(row_pointer);
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
@@ -687,17 +707,17 @@ DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
   unsigned long size;
 
   if (flags & TJ_YUV) {
-		size=tjBufSizeYUV(width, height, jpegSubsamp);
-		retval=tjEncodeYUV2(handle, srcBuf, width, pitch, height,
+    size = tjBufSizeYUV(width, height, jpegSubsamp);
+    retval = tjEncodeYUV2(handle, srcBuf, width, pitch, height,
                           getPixelFormat(pixelSize, flags), jpegBuf,
                           jpegSubsamp, flags);
   } else {
-		retval=tjCompress2(handle, srcBuf, width, pitch, height,
+    retval = tjCompress2(handle, srcBuf, width, pitch, height,
                          getPixelFormat(pixelSize, flags), &jpegBuf, &size,
                          jpegSubsamp, jpegQual, flags | TJFLAG_NOREALLOC);
-	}
-	*jpegSize=size;
-	return retval;
+  }
+  *jpegSize = size;
+  return retval;
 }
 
 
@@ -707,156 +727,156 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
                                 int *strides, int subsamp, int flags)
 {
   JSAMPROW *row_pointer = NULL;
-	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
-	JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
-	JSAMPROW *outbuf[MAX_COMPONENTS];
+  JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
+  JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
+  JSAMPROW *outbuf[MAX_COMPONENTS];
   int i, retval = 0, row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
-	JSAMPLE *ptr;
-	jpeg_component_info *compptr;
+  JSAMPLE *ptr;
+  jpeg_component_info *compptr;
 
-	getcinstance(handle);
+  GET_CINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;
-		tmpbuf2[i]=NULL;  _tmpbuf2[i]=NULL;  outbuf[i]=NULL;
-	}
+    tmpbuf[i] = NULL;  _tmpbuf[i] = NULL;
+    tmpbuf2[i] = NULL;  _tmpbuf2[i] = NULL;  outbuf[i] = NULL;
+  }
 
-	if((this->init&COMPRESS)==0)
-		_throw("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
+  if ((this->init & COMPRESS) == 0)
+    THROW("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
 
   if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF || !dstPlanes ||
       !dstPlanes[0] || subsamp < 0 || subsamp >= NUMSUBOPT)
-		_throw("tjEncodeYUVPlanes(): Invalid argument");
-	if(subsamp!=TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
-		_throw("tjEncodeYUVPlanes(): Invalid argument");
+    THROW("tjEncodeYUVPlanes(): Invalid argument");
+  if (subsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+    THROW("tjEncodeYUVPlanes(): Invalid argument");
 
-	if(pixelFormat==TJPF_CMYK)
-		_throw("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
+  if (pixelFormat == TJPF_CMYK)
+    THROW("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
 
-	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	cinfo->image_width=width;
-	cinfo->image_height=height;
+  cinfo->image_width = width;
+  cinfo->image_height = height;
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
-	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags)==-1) return -1;
+  if (setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags) == -1) return -1;
 
-	/* Execute only the parts of jpeg_start_compress() that we need.  If we
-	   were to call the whole jpeg_start_compress() function, then it would try
-	   to write the file headers, which could overflow the output buffer if the
-	   YUV image were very small. */
-	if(cinfo->global_state!=CSTATE_START)
-		_throw("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
-	(*cinfo->err->reset_error_mgr)((j_common_ptr)cinfo);
-	jinit_c_master_control(cinfo, FALSE);
-	jinit_color_converter(cinfo);
-	jinit_downsampler(cinfo);
-	(*cinfo->cconvert->start_pass)(cinfo);
+  /* Execute only the parts of jpeg_start_compress() that we need.  If we
+     were to call the whole jpeg_start_compress() function, then it would try
+     to write the file headers, which could overflow the output buffer if the
+     YUV image were very small. */
+  if (cinfo->global_state != CSTATE_START)
+    THROW("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
+  jinit_c_master_control(cinfo, FALSE);
+  jinit_color_converter(cinfo);
+  jinit_downsampler(cinfo);
+  (*cinfo->cconvert->start_pass) (cinfo);
 
-	pw0=PAD(width, cinfo->max_h_samp_factor);
-	ph0=PAD(height, cinfo->max_v_samp_factor);
+  pw0 = PAD(width, cinfo->max_h_samp_factor);
+  ph0 = PAD(height, cinfo->max_v_samp_factor);
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph0))==NULL)
-		_throw("tjEncodeYUVPlanes(): Memory allocation failure");
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
+    THROW("tjEncodeYUVPlanes(): Memory allocation failure");
   for (i = 0; i < height; i++) {
-		if(flags&TJFLAG_BOTTOMUP)
-			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = (JSAMPROW)&srcBuf[i * pitch];
-	}
-	if(height<ph0)
-		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * (size_t)pitch];
+  }
+  if (height < ph0)
+    for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
 
   for (i = 0; i < cinfo->num_components; i++) {
-		compptr=&cinfo->comp_info[i];
-		_tmpbuf[i]=(JSAMPLE *)malloc(
+    compptr = &cinfo->comp_info[i];
+    _tmpbuf[i] = (JSAMPLE *)malloc(
       PAD((compptr->width_in_blocks * cinfo->max_h_samp_factor * DCTSIZE) /
           compptr->h_samp_factor, 32) *
       cinfo->max_v_samp_factor + 32);
     if (!_tmpbuf[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     tmpbuf[i] =
       (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo->max_v_samp_factor);
     if (!tmpbuf[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     for (row = 0; row < cinfo->max_v_samp_factor; row++) {
-			unsigned char *_tmpbuf_aligned=
+      unsigned char *_tmpbuf_aligned =
         (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
 
-			tmpbuf[i][row]=&_tmpbuf_aligned[
+      tmpbuf[i][row] = &_tmpbuf_aligned[
         PAD((compptr->width_in_blocks * cinfo->max_h_samp_factor * DCTSIZE) /
             compptr->h_samp_factor, 32) * row];
-		}
+    }
     _tmpbuf2[i] =
       (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
                         compptr->v_samp_factor + 32);
     if (!_tmpbuf2[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
+    tmpbuf2[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
     if (!tmpbuf2[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     for (row = 0; row < compptr->v_samp_factor; row++) {
-			unsigned char *_tmpbuf2_aligned=
+      unsigned char *_tmpbuf2_aligned =
         (unsigned char *)PAD((size_t)_tmpbuf2[i], 32);
 
       tmpbuf2[i][row] =
         &_tmpbuf2_aligned[PAD(compptr->width_in_blocks * DCTSIZE, 32) * row];
-		}
-		pw[i]=pw0*compptr->h_samp_factor/cinfo->max_h_samp_factor;
-		ph[i]=ph0*compptr->v_samp_factor/cinfo->max_v_samp_factor;
-		outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
+    }
+    pw[i] = pw0 * compptr->h_samp_factor / cinfo->max_h_samp_factor;
+    ph[i] = ph0 * compptr->v_samp_factor / cinfo->max_v_samp_factor;
+    outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
     if (!outbuf[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		ptr=dstPlanes[i];
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
+    ptr = dstPlanes[i];
     for (row = 0; row < ph[i]; row++) {
-			outbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
+      outbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
   for (row = 0; row < ph0; row += cinfo->max_v_samp_factor) {
-		(*cinfo->cconvert->color_convert)(cinfo, &row_pointer[row], tmpbuf, 0,
-			cinfo->max_v_samp_factor);
-		(cinfo->downsample->downsample)(cinfo, tmpbuf, 0, tmpbuf2, 0);
+    (*cinfo->cconvert->color_convert) (cinfo, &row_pointer[row], tmpbuf, 0,
+                                       cinfo->max_v_samp_factor);
+    (cinfo->downsample->downsample) (cinfo, tmpbuf, 0, tmpbuf2, 0);
     for (i = 0, compptr = cinfo->comp_info; i < cinfo->num_components;
          i++, compptr++)
-			jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
-				row*compptr->v_samp_factor/cinfo->max_v_samp_factor,
-				compptr->v_samp_factor, pw[i]);
-	}
-	cinfo->next_scanline+=height;
-	jpeg_abort_compress(cinfo);
+      jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
+        row * compptr->v_samp_factor / cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, pw[i]);
+  }
+  cinfo->next_scanline += height;
+  jpeg_abort_compress(cinfo);
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	if(row_pointer) free(row_pointer);
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  free(row_pointer);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
-		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
-		if(tmpbuf2[i]!=NULL) free(tmpbuf2[i]);
-		if(_tmpbuf2[i]!=NULL) free(_tmpbuf2[i]);
-		if(outbuf[i]!=NULL) free(outbuf[i]);
-	}
-	if(this->jerr.warning) retval=-1;
+    free(tmpbuf[i]);
+    free(_tmpbuf[i]);
+    free(tmpbuf2[i]);
+    free(_tmpbuf2[i]);
+    free(outbuf[i]);
+  }
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
@@ -864,53 +884,53 @@ DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
                            unsigned char *dstBuf, int pad, int subsamp,
                            int flags)
 {
-	unsigned char *dstPlanes[3];
-	int pw0, ph0, strides[3], retval=-1;
+  unsigned char *dstPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
   tjinstance *this = (tjinstance *)handle;
 
-  if (!this) _throwg("tjEncodeYUV3(): Invalid handle");
+  if (!this) THROWG("tjEncodeYUV3(): Invalid handle");
   this->isInstanceError = FALSE;
 
-  if (width <= 0 || height <= 0 || dstBuf == NULL || pad < 0 || !isPow2(pad) ||
-      subsamp < 0 || subsamp >= NUMSUBOPT)
-		_throw("tjEncodeYUV3(): Invalid argument");
+  if (width <= 0 || height <= 0 || dstBuf == NULL || pad < 0 ||
+      !IS_POW2(pad) || subsamp < 0 || subsamp >= NUMSUBOPT)
+    THROW("tjEncodeYUV3(): Invalid argument");
 
-	pw0=tjPlaneWidth(0, width, subsamp);
-	ph0=tjPlaneHeight(0, height, subsamp);
-	dstPlanes[0]=dstBuf;
-	strides[0]=PAD(pw0, pad);
+  pw0 = tjPlaneWidth(0, width, subsamp);
+  ph0 = tjPlaneHeight(0, height, subsamp);
+  dstPlanes[0] = dstBuf;
+  strides[0] = PAD(pw0, pad);
   if (subsamp == TJSAMP_GRAY) {
-		strides[1]=strides[2]=0;
-		dstPlanes[1]=dstPlanes[2]=NULL;
+    strides[1] = strides[2] = 0;
+    dstPlanes[1] = dstPlanes[2] = NULL;
   } else {
-		int pw1=tjPlaneWidth(1, width, subsamp);
-		int ph1=tjPlaneHeight(1, height, subsamp);
+    int pw1 = tjPlaneWidth(1, width, subsamp);
+    int ph1 = tjPlaneHeight(1, height, subsamp);
 
-		strides[1]=strides[2]=PAD(pw1, pad);
-		dstPlanes[1]=dstPlanes[0]+strides[0]*ph0;
-		dstPlanes[2]=dstPlanes[1]+strides[1]*ph1;
-	}
+    strides[1] = strides[2] = PAD(pw1, pad);
+    dstPlanes[1] = dstPlanes[0] + strides[0] * ph0;
+    dstPlanes[2] = dstPlanes[1] + strides[1] * ph1;
+  }
 
-	return tjEncodeYUVPlanes(handle, srcBuf, width, pitch, height, pixelFormat,
-		dstPlanes, strides, subsamp, flags);
+  return tjEncodeYUVPlanes(handle, srcBuf, width, pitch, height, pixelFormat,
+                           dstPlanes, strides, subsamp, flags);
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 DLLEXPORT int tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf, int width,
                            int pitch, int height, int pixelFormat,
                            unsigned char *dstBuf, int subsamp, int flags)
 {
-	return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
-		dstBuf, 4, subsamp, flags);
+  return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
+                      dstBuf, 4, subsamp, flags);
 }
 
 DLLEXPORT int tjEncodeYUV(tjhandle handle, unsigned char *srcBuf, int width,
                           int pitch, int height, int pixelSize,
                           unsigned char *dstBuf, int subsamp, int flags)
 {
-	return tjEncodeYUV2(handle, srcBuf, width, pitch, height,
+  return tjEncodeYUV2(handle, srcBuf, width, pitch, height,
                       getPixelFormat(pixelSize, flags), dstBuf, subsamp,
                       flags);
 }
@@ -922,133 +942,133 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
                                       int height, int subsamp,
                                       unsigned char **jpegBuf,
                                       unsigned long *jpegSize, int jpegQual,
-	int flags)
+                                      int flags)
 {
   int i, row, retval = 0, alloc = 1;
-	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
-		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
+  int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+    tmpbufsize = 0, usetmpbuf = 0, th[MAX_COMPONENTS];
   JSAMPLE *_tmpbuf = NULL, *ptr;
   JSAMPROW *inbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
 
-	getcinstance(handle)
+  GET_CINSTANCE(handle)
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		tmpbuf[i]=NULL;  inbuf[i]=NULL;
-	}
+    tmpbuf[i] = NULL;  inbuf[i] = NULL;
+  }
 
-	if((this->init&COMPRESS)==0)
-		_throw("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
+  if ((this->init & COMPRESS) == 0)
+    THROW("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
 
   if (!srcPlanes || !srcPlanes[0] || width <= 0 || height <= 0 ||
       subsamp < 0 || subsamp >= NUMSUBOPT || jpegBuf == NULL ||
       jpegSize == NULL || jpegQual < 0 || jpegQual > 100)
-		_throw("tjCompressFromYUVPlanes(): Invalid argument");
-	if(subsamp!=TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
-		_throw("tjCompressFromYUVPlanes(): Invalid argument");
+    THROW("tjCompressFromYUVPlanes(): Invalid argument");
+  if (subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+    THROW("tjCompressFromYUVPlanes(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	cinfo->image_width=width;
-	cinfo->image_height=height;
+  cinfo->image_width = width;
+  cinfo->image_height = height;
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
   if (flags & TJFLAG_NOREALLOC) {
-		alloc=0;  *jpegSize=tjBufSize(width, height, subsamp);
-	}
-	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
-	if(setCompDefaults(cinfo, TJPF_RGB, subsamp, jpegQual, flags)==-1)
-		return -1;
-	cinfo->raw_data_in=TRUE;
+    alloc = 0;  *jpegSize = tjBufSize(width, height, subsamp);
+  }
+  jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+  if (setCompDefaults(cinfo, TJPF_RGB, subsamp, jpegQual, flags) == -1)
+    return -1;
+  cinfo->raw_data_in = TRUE;
 
-	jpeg_start_compress(cinfo, TRUE);
+  jpeg_start_compress(cinfo, TRUE);
   for (i = 0; i < cinfo->num_components; i++) {
-		jpeg_component_info *compptr=&cinfo->comp_info[i];
-		int ih;
+    jpeg_component_info *compptr = &cinfo->comp_info[i];
+    int ih;
 
-		iw[i]=compptr->width_in_blocks*DCTSIZE;
-		ih=compptr->height_in_blocks*DCTSIZE;
+    iw[i] = compptr->width_in_blocks * DCTSIZE;
+    ih = compptr->height_in_blocks * DCTSIZE;
     pw[i] = PAD(cinfo->image_width, cinfo->max_h_samp_factor) *
             compptr->h_samp_factor / cinfo->max_h_samp_factor;
     ph[i] = PAD(cinfo->image_height, cinfo->max_v_samp_factor) *
             compptr->v_samp_factor / cinfo->max_v_samp_factor;
-		if(iw[i]!=pw[i] || ih!=ph[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*DCTSIZE;
-		tmpbufsize+=iw[i]*th[i];
-		if((inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
-			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
-		ptr=(JSAMPLE *)srcPlanes[i];
+    if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
+    th[i] = compptr->v_samp_factor * DCTSIZE;
+    tmpbufsize += iw[i] * th[i];
+    if ((inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
+      THROW("tjCompressFromYUVPlanes(): Memory allocation failure");
+    ptr = (JSAMPLE *)srcPlanes[i];
     for (row = 0; row < ph[i]; row++) {
-			inbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
+      inbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
   if (usetmpbuf) {
-		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
-		ptr=_tmpbuf;
+    if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
+      THROW("tjCompressFromYUVPlanes(): Memory allocation failure");
+    ptr = _tmpbuf;
     for (i = 0; i < cinfo->num_components; i++) {
-			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+      if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
+        THROW("tjCompressFromYUVPlanes(): Memory allocation failure");
       for (row = 0; row < th[i]; row++) {
-				tmpbuf[i][row]=ptr;
-				ptr+=iw[i];
-			}
-		}
-	}
+        tmpbuf[i][row] = ptr;
+        ptr += iw[i];
+      }
+    }
+  }
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	for(row=0; row<(int)cinfo->image_height;
+  for (row = 0; row < (int)cinfo->image_height;
        row += cinfo->max_v_samp_factor * DCTSIZE) {
-		JSAMPARRAY yuvptr[MAX_COMPONENTS];
-		int crow[MAX_COMPONENTS];
+    JSAMPARRAY yuvptr[MAX_COMPONENTS];
+    int crow[MAX_COMPONENTS];
 
     for (i = 0; i < cinfo->num_components; i++) {
-			jpeg_component_info *compptr=&cinfo->comp_info[i];
+      jpeg_component_info *compptr = &cinfo->comp_info[i];
 
-			crow[i]=row*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+      crow[i] = row * compptr->v_samp_factor / cinfo->max_v_samp_factor;
       if (usetmpbuf) {
-				int j, k;
+        int j, k;
 
         for (j = 0; j < MIN(th[i], ph[i] - crow[i]); j++) {
-					memcpy(tmpbuf[i][j], inbuf[i][crow[i]+j], pw[i]);
-					/* Duplicate last sample in row to fill out MCU */
+          memcpy(tmpbuf[i][j], inbuf[i][crow[i] + j], pw[i]);
+          /* Duplicate last sample in row to fill out MCU */
           for (k = pw[i]; k < iw[i]; k++)
             tmpbuf[i][j][k] = tmpbuf[i][j][pw[i] - 1];
-				}
-				/* Duplicate last row to fill out MCU */
-				for(j=ph[i]-crow[i]; j<th[i]; j++)
-					memcpy(tmpbuf[i][j], tmpbuf[i][ph[i]-crow[i]-1], iw[i]);
-				yuvptr[i]=tmpbuf[i];
+        }
+        /* Duplicate last row to fill out MCU */
+        for (j = ph[i] - crow[i]; j < th[i]; j++)
+          memcpy(tmpbuf[i][j], tmpbuf[i][ph[i] - crow[i] - 1], iw[i]);
+        yuvptr[i] = tmpbuf[i];
       } else
-				yuvptr[i]=&inbuf[i][crow[i]];
-		}
-		jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor*DCTSIZE);
-	}
-	jpeg_finish_compress(cinfo);
+        yuvptr[i] = &inbuf[i][crow[i]];
+    }
+    jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor * DCTSIZE);
+  }
+  jpeg_finish_compress(cinfo);
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		if(tmpbuf[i]) free(tmpbuf[i]);
-		if(inbuf[i]) free(inbuf[i]);
-	}
-	if(_tmpbuf) free(_tmpbuf);
-	if(this->jerr.warning) retval=-1;
+    free(tmpbuf[i]);
+    free(inbuf[i]);
+  }
+  free(_tmpbuf);
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
@@ -1057,38 +1077,38 @@ DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
                                 unsigned long *jpegSize, int jpegQual,
                                 int flags)
 {
-	const unsigned char *srcPlanes[3];
-	int pw0, ph0, strides[3], retval=-1;
+  const unsigned char *srcPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
   tjinstance *this = (tjinstance *)handle;
 
-  if (!this) _throwg("tjCompressFromYUV(): Invalid handle");
+  if (!this) THROWG("tjCompressFromYUV(): Invalid handle");
   this->isInstanceError = FALSE;
 
   if (srcBuf == NULL || width <= 0 || pad < 1 || height <= 0 || subsamp < 0 ||
       subsamp >= NUMSUBOPT)
-		_throw("tjCompressFromYUV(): Invalid argument");
+    THROW("tjCompressFromYUV(): Invalid argument");
 
-	pw0=tjPlaneWidth(0, width, subsamp);
-	ph0=tjPlaneHeight(0, height, subsamp);
-	srcPlanes[0]=srcBuf;
-	strides[0]=PAD(pw0, pad);
+  pw0 = tjPlaneWidth(0, width, subsamp);
+  ph0 = tjPlaneHeight(0, height, subsamp);
+  srcPlanes[0] = srcBuf;
+  strides[0] = PAD(pw0, pad);
   if (subsamp == TJSAMP_GRAY) {
-		strides[1]=strides[2]=0;
-		srcPlanes[1]=srcPlanes[2]=NULL;
+    strides[1] = strides[2] = 0;
+    srcPlanes[1] = srcPlanes[2] = NULL;
   } else {
-		int pw1=tjPlaneWidth(1, width, subsamp);
-		int ph1=tjPlaneHeight(1, height, subsamp);
+    int pw1 = tjPlaneWidth(1, width, subsamp);
+    int ph1 = tjPlaneHeight(1, height, subsamp);
 
-		strides[1]=strides[2]=PAD(pw1, pad);
-		srcPlanes[1]=srcPlanes[0]+strides[0]*ph0;
-		srcPlanes[2]=srcPlanes[1]+strides[1]*ph1;
-	}
+    strides[1] = strides[2] = PAD(pw1, pad);
+    srcPlanes[1] = srcPlanes[0] + strides[0] * ph0;
+    srcPlanes[2] = srcPlanes[1] + strides[1] * ph1;
+  }
 
-	return tjCompressFromYUVPlanes(handle, srcPlanes, width, strides, height,
-		subsamp, jpegBuf, jpegSize, jpegQual, flags);
+  return tjCompressFromYUVPlanes(handle, srcPlanes, width, strides, height,
+                                 subsamp, jpegBuf, jpegSize, jpegQual, flags);
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 
@@ -1096,44 +1116,44 @@ DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
 
 static tjhandle _tjInitDecompress(tjinstance *this)
 {
-	static unsigned char buffer[1];
+  static unsigned char buffer[1];
 
   /* This is also straight out of example.txt */
-	this->dinfo.err=jpeg_std_error(&this->jerr.pub);
-	this->jerr.pub.error_exit=my_error_exit;
-	this->jerr.pub.output_message=my_output_message;
-	this->jerr.emit_message=this->jerr.pub.emit_message;
-	this->jerr.pub.emit_message=my_emit_message;
+  this->dinfo.err = jpeg_std_error(&this->jerr.pub);
+  this->jerr.pub.error_exit = my_error_exit;
+  this->jerr.pub.output_message = my_output_message;
+  this->jerr.emit_message = this->jerr.pub.emit_message;
+  this->jerr.pub.emit_message = my_emit_message;
   this->jerr.pub.addon_message_table = turbojpeg_message_table;
   this->jerr.pub.first_addon_message = JMSG_FIRSTADDONCODE;
   this->jerr.pub.last_addon_message = JMSG_LASTADDONCODE;
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		if(this) free(this);
-		return NULL;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    free(this);
+    return NULL;
+  }
 
-	jpeg_create_decompress(&this->dinfo);
-	/* Make an initial call so it will create the source manager */
-	jpeg_mem_src_tj(&this->dinfo, buffer, 1);
+  jpeg_create_decompress(&this->dinfo);
+  /* Make an initial call so it will create the source manager */
+  jpeg_mem_src_tj(&this->dinfo, buffer, 1);
 
-	this->init|=DECOMPRESS;
-	return (tjhandle)this;
+  this->init |= DECOMPRESS;
+  return (tjhandle)this;
 }
 
 DLLEXPORT tjhandle tjInitDecompress(void)
 {
-	tjinstance *this;
+  tjinstance *this;
 
   if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL) {
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjInitDecompress(): Memory allocation failure");
-		return NULL;
-	}
-	MEMZERO(this, sizeof(tjinstance));
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjInitDecompress(): Memory allocation failure");
+    return NULL;
+  }
+  MEMZERO(this, sizeof(tjinstance));
   snprintf(this->errStr, JMSG_LENGTH_MAX, "No error");
-	return _tjInitDecompress(this);
+  return _tjInitDecompress(this);
 }
 
 
@@ -1143,81 +1163,81 @@ DLLEXPORT int tjDecompressHeader3(tjhandle handle,
                                   int *height, int *jpegSubsamp,
                                   int *jpegColorspace)
 {
-	int retval=0;
+  int retval = 0;
 
-	getdinstance(handle);
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("tjDecompressHeader3(): Instance has not been initialized for decompression");
 
   if (jpegBuf == NULL || jpegSize <= 0 || width == NULL || height == NULL ||
       jpegSubsamp == NULL || jpegColorspace == NULL)
-		_throw("tjDecompressHeader3(): Invalid argument");
+    THROW("tjDecompressHeader3(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		return -1;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    return -1;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
 
-	*width=dinfo->image_width;
-	*height=dinfo->image_height;
-	*jpegSubsamp=getSubsamp(dinfo);
+  *width = dinfo->image_width;
+  *height = dinfo->image_height;
+  *jpegSubsamp = getSubsamp(dinfo);
   switch (dinfo->jpeg_color_space) {
-		case JCS_GRAYSCALE:  *jpegColorspace=TJCS_GRAY;  break;
-		case JCS_RGB:        *jpegColorspace=TJCS_RGB;  break;
-		case JCS_YCbCr:      *jpegColorspace=TJCS_YCbCr;  break;
-		case JCS_CMYK:       *jpegColorspace=TJCS_CMYK;  break;
-		case JCS_YCCK:       *jpegColorspace=TJCS_YCCK;  break;
-		default:             *jpegColorspace=-1;  break;
-	}
+  case JCS_GRAYSCALE:  *jpegColorspace = TJCS_GRAY;  break;
+  case JCS_RGB:        *jpegColorspace = TJCS_RGB;  break;
+  case JCS_YCbCr:      *jpegColorspace = TJCS_YCbCr;  break;
+  case JCS_CMYK:       *jpegColorspace = TJCS_CMYK;  break;
+  case JCS_YCCK:       *jpegColorspace = TJCS_YCCK;  break;
+  default:             *jpegColorspace = -1;  break;
+  }
 
-	jpeg_abort_decompress(dinfo);
+  jpeg_abort_decompress(dinfo);
 
-	if(*jpegSubsamp<0)
-		_throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
-	if(*jpegColorspace<0)
-		_throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
-	if(*width<1 || *height<1)
-		_throw("tjDecompressHeader3(): Invalid data returned in header");
+  if (*jpegSubsamp < 0)
+    THROW("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
+  if (*jpegColorspace < 0)
+    THROW("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
+  if (*width < 1 || *height < 1)
+    THROW("tjDecompressHeader3(): Invalid data returned in header");
 
-	bailout:
-	if(this->jerr.warning) retval=-1;
-	return retval;
+bailout:
+  if (this->jerr.warning) retval = -1;
+  return retval;
 }
 
 DLLEXPORT int tjDecompressHeader2(tjhandle handle, unsigned char *jpegBuf,
                                   unsigned long jpegSize, int *width,
                                   int *height, int *jpegSubsamp)
 {
-	int jpegColorspace;
+  int jpegColorspace;
 
-	return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
-		jpegSubsamp, &jpegColorspace);
+  return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
+                             jpegSubsamp, &jpegColorspace);
 }
 
 DLLEXPORT int tjDecompressHeader(tjhandle handle, unsigned char *jpegBuf,
                                  unsigned long jpegSize, int *width,
                                  int *height)
 {
-	int jpegSubsamp;
+  int jpegSubsamp;
 
-	return tjDecompressHeader2(handle, jpegBuf, jpegSize, width, height,
-		&jpegSubsamp);
+  return tjDecompressHeader2(handle, jpegBuf, jpegSize, width, height,
+                             &jpegSubsamp);
 }
 
 
 DLLEXPORT tjscalingfactor *tjGetScalingFactors(int *numscalingfactors)
 {
   if (numscalingfactors == NULL) {
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjGetScalingFactors(): Invalid argument");
-		return NULL;
-	}
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjGetScalingFactors(): Invalid argument");
+    return NULL;
+  }
 
-	*numscalingfactors=NUMSF;
-	return (tjscalingfactor *)sf;
+  *numscalingfactors = NUMSF;
+  return (tjscalingfactor *)sf;
 }
 
 
@@ -1229,74 +1249,74 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
   JSAMPROW *row_pointer = NULL;
   int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
 
-	getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompress2(): Instance has not been initialized for decompression");
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("tjDecompress2(): Instance has not been initialized for decompression");
 
   if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
       pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
-		_throw("tjDecompress2(): Invalid argument");
+    THROW("tjDecompress2(): Invalid argument");
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
   this->dinfo.out_color_space = pf2cs[pixelFormat];
   if (flags & TJFLAG_FASTDCT) this->dinfo.dct_method = JDCT_FASTEST;
-	if(flags&TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling=FALSE;
+  if (flags & TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling = FALSE;
 
-	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
-	if(width==0) width=jpegwidth;
-	if(height==0) height=jpegheight;
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
   for (i = 0; i < NUMSF; i++) {
-		scaledw=TJSCALED(jpegwidth, sf[i]);
-		scaledh=TJSCALED(jpegheight, sf[i]);
-		if(scaledw<=width && scaledh<=height)
-			break;
-	}
-	if(i>=NUMSF)
-		_throw("tjDecompress2(): Could not scale down to desired image dimensions");
-	width=scaledw;  height=scaledh;
-	dinfo->scale_num=sf[i].num;
-	dinfo->scale_denom=sf[i].denom;
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("tjDecompress2(): Could not scale down to desired image dimensions");
+  width = scaledw;  height = scaledh;
+  dinfo->scale_num = sf[i].num;
+  dinfo->scale_denom = sf[i].denom;
 
-	jpeg_start_decompress(dinfo);
-	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
+  jpeg_start_decompress(dinfo);
+  if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
 
   if ((row_pointer =
        (JSAMPROW *)malloc(sizeof(JSAMPROW) * dinfo->output_height)) == NULL)
-		_throw("tjDecompress2(): Memory allocation failure");
+    THROW("tjDecompress2(): Memory allocation failure");
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
   for (i = 0; i < (int)dinfo->output_height; i++) {
-		if(flags&TJFLAG_BOTTOMUP)
-			row_pointer[i]=&dstBuf[(dinfo->output_height-i-1)*pitch];
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = &dstBuf[(dinfo->output_height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = &dstBuf[i * pitch];
-	}
-	while(dinfo->output_scanline<dinfo->output_height)
-		jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline],
-			dinfo->output_height-dinfo->output_scanline);
-	jpeg_finish_decompress(dinfo);
+      row_pointer[i] = &dstBuf[i * (size_t)pitch];
+  }
+  while (dinfo->output_scanline < dinfo->output_height)
+    jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline],
+                        dinfo->output_height - dinfo->output_scanline);
+  jpeg_finish_decompress(dinfo);
 
-	bailout:
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	if(row_pointer) free(row_pointer);
-	if(this->jerr.warning) retval=-1;
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  free(row_pointer);
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
@@ -1304,61 +1324,61 @@ DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
                            int width, int pitch, int height, int pixelSize,
                            int flags)
 {
-	if(flags&TJ_YUV)
-		return tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags);
-	else
-		return tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, width, pitch,
-			height, getPixelFormat(pixelSize, flags), flags);
+  if (flags & TJ_YUV)
+    return tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags);
+  else
+    return tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, width, pitch,
+                         height, getPixelFormat(pixelSize, flags), flags);
 }
 
 
 static int setDecodeDefaults(struct jpeg_decompress_struct *dinfo,
-	int pixelFormat, int subsamp, int flags)
+                             int pixelFormat, int subsamp, int flags)
 {
-	int i;
+  int i;
 
-	dinfo->scale_num=dinfo->scale_denom=1;
+  dinfo->scale_num = dinfo->scale_denom = 1;
 
   if (subsamp == TJSAMP_GRAY) {
-		dinfo->num_components=dinfo->comps_in_scan=1;
-		dinfo->jpeg_color_space=JCS_GRAYSCALE;
+    dinfo->num_components = dinfo->comps_in_scan = 1;
+    dinfo->jpeg_color_space = JCS_GRAYSCALE;
   } else {
-		dinfo->num_components=dinfo->comps_in_scan=3;
-		dinfo->jpeg_color_space=JCS_YCbCr;
-	}
+    dinfo->num_components = dinfo->comps_in_scan = 3;
+    dinfo->jpeg_color_space = JCS_YCbCr;
+  }
 
-	dinfo->comp_info=(jpeg_component_info *)
-		(*dinfo->mem->alloc_small)((j_common_ptr)dinfo, JPOOL_IMAGE,
+  dinfo->comp_info = (jpeg_component_info *)
+    (*dinfo->mem->alloc_small) ((j_common_ptr)dinfo, JPOOL_IMAGE,
                                 dinfo->num_components *
                                 sizeof(jpeg_component_info));
 
   for (i = 0; i < dinfo->num_components; i++) {
-		jpeg_component_info *compptr=&dinfo->comp_info[i];
+    jpeg_component_info *compptr = &dinfo->comp_info[i];
 
-		compptr->h_samp_factor=(i==0)? tjMCUWidth[subsamp]/8:1;
-		compptr->v_samp_factor=(i==0)? tjMCUHeight[subsamp]/8:1;
-		compptr->component_index=i;
-		compptr->component_id=i+1;
+    compptr->h_samp_factor = (i == 0) ? tjMCUWidth[subsamp] / 8 : 1;
+    compptr->v_samp_factor = (i == 0) ? tjMCUHeight[subsamp] / 8 : 1;
+    compptr->component_index = i;
+    compptr->component_id = i + 1;
     compptr->quant_tbl_no = compptr->dc_tbl_no =
       compptr->ac_tbl_no = (i == 0) ? 0 : 1;
-		dinfo->cur_comp_info[i]=compptr;
-	}
-	dinfo->data_precision=8;
+    dinfo->cur_comp_info[i] = compptr;
+  }
+  dinfo->data_precision = 8;
   for (i = 0; i < 2; i++) {
-		if(dinfo->quant_tbl_ptrs[i]==NULL)
-			dinfo->quant_tbl_ptrs[i]=jpeg_alloc_quant_table((j_common_ptr)dinfo);
-	}
+    if (dinfo->quant_tbl_ptrs[i] == NULL)
+      dinfo->quant_tbl_ptrs[i] = jpeg_alloc_quant_table((j_common_ptr)dinfo);
+  }
 
-	return 0;
+  return 0;
 }
 
 
-int my_read_markers(j_decompress_ptr dinfo)
+static int my_read_markers(j_decompress_ptr dinfo)
 {
-	return JPEG_REACHED_SOS;
+  return JPEG_REACHED_SOS;
 }
 
-void my_reset_marker_reader(j_decompress_ptr dinfo)
+static void my_reset_marker_reader(j_decompress_ptr dinfo)
 {
 }
 
@@ -1369,142 +1389,145 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
                                 int height, int pixelFormat, int flags)
 {
   JSAMPROW *row_pointer = NULL;
-	JSAMPLE *_tmpbuf[MAX_COMPONENTS];
-	JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
+  JSAMPLE *_tmpbuf[MAX_COMPONENTS];
+  JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
   int i, retval = 0, row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
-	JSAMPLE *ptr;
-	jpeg_component_info *compptr;
-	int (*old_read_markers)(j_decompress_ptr);
-	void (*old_reset_marker_reader)(j_decompress_ptr);
+  JSAMPLE *ptr;
+  jpeg_component_info *compptr;
+  int (*old_read_markers) (j_decompress_ptr);
+  void (*old_reset_marker_reader) (j_decompress_ptr);
 
-	getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;  inbuf[i]=NULL;
-	}
+    tmpbuf[i] = NULL;  _tmpbuf[i] = NULL;  inbuf[i] = NULL;
+  }
 
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
 
   if (!srcPlanes || !srcPlanes[0] || subsamp < 0 || subsamp >= NUMSUBOPT ||
       dstBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
-		_throw("tjDecodeYUVPlanes(): Invalid argument");
-	if(subsamp!=TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
-		_throw("tjDecodeYUVPlanes(): Invalid argument");
+    THROW("tjDecodeYUVPlanes(): Invalid argument");
+  if (subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+    THROW("tjDecodeYUVPlanes(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	if(pixelFormat==TJPF_CMYK)
-		_throw("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
+  if (pixelFormat == TJPF_CMYK)
+    THROW("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
 
-	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
-	dinfo->image_width=width;
-	dinfo->image_height=height;
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+  dinfo->image_width = width;
+  dinfo->image_height = height;
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
+  dinfo->progressive_mode = dinfo->inputctl->has_multiple_scans = FALSE;
+  dinfo->Ss = dinfo->Ah = dinfo->Al = 0;
+  dinfo->Se = DCTSIZE2 - 1;
   if (setDecodeDefaults(dinfo, pixelFormat, subsamp, flags) == -1) {
-		retval=-1;  goto bailout;
-	}
-	old_read_markers=dinfo->marker->read_markers;
-	dinfo->marker->read_markers=my_read_markers;
-	old_reset_marker_reader=dinfo->marker->reset_marker_reader;
-	dinfo->marker->reset_marker_reader=my_reset_marker_reader;
-	jpeg_read_header(dinfo, TRUE);
-	dinfo->marker->read_markers=old_read_markers;
-	dinfo->marker->reset_marker_reader=old_reset_marker_reader;
+    retval = -1;  goto bailout;
+  }
+  old_read_markers = dinfo->marker->read_markers;
+  dinfo->marker->read_markers = my_read_markers;
+  old_reset_marker_reader = dinfo->marker->reset_marker_reader;
+  dinfo->marker->reset_marker_reader = my_reset_marker_reader;
+  jpeg_read_header(dinfo, TRUE);
+  dinfo->marker->read_markers = old_read_markers;
+  dinfo->marker->reset_marker_reader = old_reset_marker_reader;
 
   this->dinfo.out_color_space = pf2cs[pixelFormat];
   if (flags & TJFLAG_FASTDCT) this->dinfo.dct_method = JDCT_FASTEST;
-	dinfo->do_fancy_upsampling=FALSE;
-	dinfo->Se=DCTSIZE2-1;
-	jinit_master_decompress(dinfo);
-	(*dinfo->upsample->start_pass)(dinfo);
+  dinfo->do_fancy_upsampling = FALSE;
+  dinfo->Se = DCTSIZE2 - 1;
+  jinit_master_decompress(dinfo);
+  (*dinfo->upsample->start_pass) (dinfo);
 
-	pw0=PAD(width, dinfo->max_h_samp_factor);
-	ph0=PAD(height, dinfo->max_v_samp_factor);
+  pw0 = PAD(width, dinfo->max_h_samp_factor);
+  ph0 = PAD(height, dinfo->max_v_samp_factor);
 
-	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
+  if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph0))==NULL)
-		_throw("tjDecodeYUVPlanes(): Memory allocation failure");
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
+    THROW("tjDecodeYUVPlanes(): Memory allocation failure");
   for (i = 0; i < height; i++) {
     if (flags & TJFLAG_BOTTOMUP)
-      row_pointer[i] = &dstBuf[(height - i - 1) * pitch];
+      row_pointer[i] = &dstBuf[(height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = &dstBuf[i * pitch];
-	}
-	if(height<ph0)
-		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
+      row_pointer[i] = &dstBuf[i * (size_t)pitch];
+  }
+  if (height < ph0)
+    for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
 
   for (i = 0; i < dinfo->num_components; i++) {
-		compptr=&dinfo->comp_info[i];
+    compptr = &dinfo->comp_info[i];
     _tmpbuf[i] =
       (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
                         compptr->v_samp_factor + 32);
     if (!_tmpbuf[i])
-      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
+      THROW("tjDecodeYUVPlanes(): Memory allocation failure");
+    tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
     if (!tmpbuf[i])
-      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+      THROW("tjDecodeYUVPlanes(): Memory allocation failure");
     for (row = 0; row < compptr->v_samp_factor; row++) {
-			unsigned char *_tmpbuf_aligned=
+      unsigned char *_tmpbuf_aligned =
         (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
 
       tmpbuf[i][row] =
         &_tmpbuf_aligned[PAD(compptr->width_in_blocks * DCTSIZE, 32) * row];
-		}
-		pw[i]=pw0*compptr->h_samp_factor/dinfo->max_h_samp_factor;
-		ph[i]=ph0*compptr->v_samp_factor/dinfo->max_v_samp_factor;
-		inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
+    }
+    pw[i] = pw0 * compptr->h_samp_factor / dinfo->max_h_samp_factor;
+    ph[i] = ph0 * compptr->v_samp_factor / dinfo->max_v_samp_factor;
+    inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
     if (!inbuf[i])
-      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		ptr=(JSAMPLE *)srcPlanes[i];
+      THROW("tjDecodeYUVPlanes(): Memory allocation failure");
+    ptr = (JSAMPLE *)srcPlanes[i];
     for (row = 0; row < ph[i]; row++) {
-			inbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
+      inbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
   for (row = 0; row < ph0; row += dinfo->max_v_samp_factor) {
-		JDIMENSION inrow=0, outrow=0;
+    JDIMENSION inrow = 0, outrow = 0;
 
     for (i = 0, compptr = dinfo->comp_info; i < dinfo->num_components;
          i++, compptr++)
-			jcopy_sample_rows(inbuf[i],
-				row*compptr->v_samp_factor/dinfo->max_v_samp_factor, tmpbuf[i], 0,
-				compptr->v_samp_factor, pw[i]);
-		(dinfo->upsample->upsample)(dinfo, tmpbuf, &inrow,
+      jcopy_sample_rows(inbuf[i],
+        row * compptr->v_samp_factor / dinfo->max_v_samp_factor, tmpbuf[i], 0,
+        compptr->v_samp_factor, pw[i]);
+    (dinfo->upsample->upsample) (dinfo, tmpbuf, &inrow,
                                  dinfo->max_v_samp_factor, &row_pointer[row],
                                  &outrow, dinfo->max_v_samp_factor);
-	}
-	jpeg_abort_decompress(dinfo);
+  }
+  jpeg_abort_decompress(dinfo);
 
-	bailout:
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	if(row_pointer) free(row_pointer);
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  free(row_pointer);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
-		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
-		if(inbuf[i]!=NULL) free(inbuf[i]);
-	}
-	if(this->jerr.warning) retval=-1;
+    free(tmpbuf[i]);
+    free(_tmpbuf[i]);
+    free(inbuf[i]);
+  }
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
@@ -1512,38 +1535,38 @@ DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
                           int width, int pitch, int height, int pixelFormat,
                           int flags)
 {
-	const unsigned char *srcPlanes[3];
-	int pw0, ph0, strides[3], retval=-1;
+  const unsigned char *srcPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
   tjinstance *this = (tjinstance *)handle;
 
-  if (!this) _throwg("tjDecodeYUV(): Invalid handle");
+  if (!this) THROWG("tjDecodeYUV(): Invalid handle");
   this->isInstanceError = FALSE;
 
-  if (srcBuf == NULL || pad < 0 || !isPow2(pad) || subsamp < 0 ||
+  if (srcBuf == NULL || pad < 0 || !IS_POW2(pad) || subsamp < 0 ||
       subsamp >= NUMSUBOPT || width <= 0 || height <= 0)
-		_throw("tjDecodeYUV(): Invalid argument");
+    THROW("tjDecodeYUV(): Invalid argument");
 
-	pw0=tjPlaneWidth(0, width, subsamp);
-	ph0=tjPlaneHeight(0, height, subsamp);
-	srcPlanes[0]=srcBuf;
-	strides[0]=PAD(pw0, pad);
+  pw0 = tjPlaneWidth(0, width, subsamp);
+  ph0 = tjPlaneHeight(0, height, subsamp);
+  srcPlanes[0] = srcBuf;
+  strides[0] = PAD(pw0, pad);
   if (subsamp == TJSAMP_GRAY) {
-		strides[1]=strides[2]=0;
-		srcPlanes[1]=srcPlanes[2]=NULL;
+    strides[1] = strides[2] = 0;
+    srcPlanes[1] = srcPlanes[2] = NULL;
   } else {
-		int pw1=tjPlaneWidth(1, width, subsamp);
-		int ph1=tjPlaneHeight(1, height, subsamp);
+    int pw1 = tjPlaneWidth(1, width, subsamp);
+    int ph1 = tjPlaneHeight(1, height, subsamp);
 
-		strides[1]=strides[2]=PAD(pw1, pad);
-		srcPlanes[1]=srcPlanes[0]+strides[0]*ph0;
-		srcPlanes[2]=srcPlanes[1]+strides[1]*ph1;
-	}
+    strides[1] = strides[2] = PAD(pw1, pad);
+    srcPlanes[1] = srcPlanes[0] + strides[0] * ph0;
+    srcPlanes[2] = srcPlanes[1] + strides[1] * ph1;
+  }
 
-	return tjDecodeYUVPlanes(handle, srcPlanes, strides, subsamp, dstBuf, width,
-		pitch, height, pixelFormat, flags);
+  return tjDecodeYUVPlanes(handle, srcPlanes, strides, subsamp, dstBuf, width,
+                           pitch, height, pixelFormat, flags);
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
@@ -1553,241 +1576,239 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
                                       int *strides, int height, int flags)
 {
   int i, sfi, row, retval = 0;
-	int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
-	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
-		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
+  int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
+  int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+    tmpbufsize = 0, usetmpbuf = 0, th[MAX_COMPONENTS];
   JSAMPLE *_tmpbuf = NULL, *ptr;
   JSAMPROW *outbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
-	int dctsize;
+  int dctsize;
 
-	getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		tmpbuf[i]=NULL;  outbuf[i]=NULL;
-	}
+    tmpbuf[i] = NULL;  outbuf[i] = NULL;
+  }
 
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
 
   if (jpegBuf == NULL || jpegSize <= 0 || !dstPlanes || !dstPlanes[0] ||
       width < 0 || height < 0)
-		_throw("tjDecompressToYUVPlanes(): Invalid argument");
+    THROW("tjDecompressToYUVPlanes(): Invalid argument");
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
   if (!this->headerRead) {
-		jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-		jpeg_read_header(dinfo, TRUE);
-	}
-	this->headerRead=0;
-	jpegSubsamp=getSubsamp(dinfo);
-	if(jpegSubsamp<0)
-		_throw("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
+    jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+    jpeg_read_header(dinfo, TRUE);
+  }
+  this->headerRead = 0;
+  jpegSubsamp = getSubsamp(dinfo);
+  if (jpegSubsamp < 0)
+    THROW("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
 
-	if(jpegSubsamp!=TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
-		_throw("tjDecompressToYUVPlanes(): Invalid argument");
+  if (jpegSubsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+    THROW("tjDecompressToYUVPlanes(): Invalid argument");
 
-	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
-	if(width==0) width=jpegwidth;
-	if(height==0) height=jpegheight;
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
   for (i = 0; i < NUMSF; i++) {
-		scaledw=TJSCALED(jpegwidth, sf[i]);
-		scaledh=TJSCALED(jpegheight, sf[i]);
-		if(scaledw<=width && scaledh<=height)
-			break;
-	}
-	if(i>=NUMSF)
-		_throw("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
-	if(dinfo->num_components>3)
-		_throw("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
+  if (dinfo->num_components > 3)
+    THROW("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
 
-	width=scaledw;  height=scaledh;
-	dinfo->scale_num=sf[i].num;
-	dinfo->scale_denom=sf[i].denom;
-	sfi=i;
-	jpeg_calc_output_dimensions(dinfo);
+  width = scaledw;  height = scaledh;
+  dinfo->scale_num = sf[i].num;
+  dinfo->scale_denom = sf[i].denom;
+  sfi = i;
+  jpeg_calc_output_dimensions(dinfo);
 
-	dctsize=DCTSIZE*sf[sfi].num/sf[sfi].denom;
+  dctsize = DCTSIZE * sf[sfi].num / sf[sfi].denom;
 
   for (i = 0; i < dinfo->num_components; i++) {
-		jpeg_component_info *compptr=&dinfo->comp_info[i];
-		int ih;
+    jpeg_component_info *compptr = &dinfo->comp_info[i];
+    int ih;
 
-		iw[i]=compptr->width_in_blocks*dctsize;
-		ih=compptr->height_in_blocks*dctsize;
-    pw[i] = PAD(dinfo->output_width, dinfo->max_h_samp_factor) *
-            compptr->h_samp_factor / dinfo->max_h_samp_factor;
-    ph[i] = PAD(dinfo->output_height, dinfo->max_v_samp_factor) *
-            compptr->v_samp_factor / dinfo->max_v_samp_factor;
-		if(iw[i]!=pw[i] || ih!=ph[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*dctsize;
-		tmpbufsize+=iw[i]*th[i];
-		if((outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
-			_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
-		ptr=dstPlanes[i];
+    iw[i] = compptr->width_in_blocks * dctsize;
+    ih = compptr->height_in_blocks * dctsize;
+    pw[i] = tjPlaneWidth(i, dinfo->output_width, jpegSubsamp);
+    ph[i] = tjPlaneHeight(i, dinfo->output_height, jpegSubsamp);
+    if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
+    th[i] = compptr->v_samp_factor * dctsize;
+    tmpbufsize += iw[i] * th[i];
+    if ((outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
+      THROW("tjDecompressToYUVPlanes(): Memory allocation failure");
+    ptr = dstPlanes[i];
     for (row = 0; row < ph[i]; row++) {
-			outbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
+      outbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
   if (usetmpbuf) {
-		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
-		ptr=_tmpbuf;
+    if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
+      THROW("tjDecompressToYUVPlanes(): Memory allocation failure");
+    ptr = _tmpbuf;
     for (i = 0; i < dinfo->num_components; i++) {
-			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+      if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
+        THROW("tjDecompressToYUVPlanes(): Memory allocation failure");
       for (row = 0; row < th[i]; row++) {
-				tmpbuf[i][row]=ptr;
-				ptr+=iw[i];
-			}
-		}
-	}
+        tmpbuf[i][row] = ptr;
+        ptr += iw[i];
+      }
+    }
+  }
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	if(flags&TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling=FALSE;
-	if(flags&TJFLAG_FASTDCT) dinfo->dct_method=JDCT_FASTEST;
-	dinfo->raw_data_out=TRUE;
+  if (flags & TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling = FALSE;
+  if (flags & TJFLAG_FASTDCT) dinfo->dct_method = JDCT_FASTEST;
+  dinfo->raw_data_out = TRUE;
 
-	jpeg_start_decompress(dinfo);
-	for(row=0; row<(int)dinfo->output_height;
+  jpeg_start_decompress(dinfo);
+  for (row = 0; row < (int)dinfo->output_height;
        row += dinfo->max_v_samp_factor * dinfo->_min_DCT_scaled_size) {
-		JSAMPARRAY yuvptr[MAX_COMPONENTS];
-		int crow[MAX_COMPONENTS];
+    JSAMPARRAY yuvptr[MAX_COMPONENTS];
+    int crow[MAX_COMPONENTS];
 
     for (i = 0; i < dinfo->num_components; i++) {
-			jpeg_component_info *compptr=&dinfo->comp_info[i];
+      jpeg_component_info *compptr = &dinfo->comp_info[i];
 
       if (jpegSubsamp == TJ_420) {
-				/* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
-				   to be clever and use the IDCT to perform upsampling on the U and V
-				   planes.  For instance, if the output image is to be scaled by 1/2
-				   relative to the JPEG image, then the scaling factor and upsampling
-				   effectively cancel each other, so a normal 8x8 IDCT can be used.
-				   However, this is not desirable when using the decompress-to-YUV
-				   functionality in TurboJPEG, since we want to output the U and V
-				   planes in their subsampled form.  Thus, we have to override some
-				   internal libjpeg parameters to force it to use the "scaled" IDCT
-				   functions on the U and V planes. */
-				compptr->_DCT_scaled_size=dctsize;
-				compptr->MCU_sample_width=tjMCUWidth[jpegSubsamp]*
-					sf[sfi].num/sf[sfi].denom*
-					compptr->v_samp_factor/dinfo->max_v_samp_factor;
-				dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
-			}
-			crow[i]=row*compptr->v_samp_factor/dinfo->max_v_samp_factor;
-			if(usetmpbuf) yuvptr[i]=tmpbuf[i];
-			else yuvptr[i]=&outbuf[i][crow[i]];
-		}
-		jpeg_read_raw_data(dinfo, yuvptr,
-			dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size);
+        /* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
+           to be clever and use the IDCT to perform upsampling on the U and V
+           planes.  For instance, if the output image is to be scaled by 1/2
+           relative to the JPEG image, then the scaling factor and upsampling
+           effectively cancel each other, so a normal 8x8 IDCT can be used.
+           However, this is not desirable when using the decompress-to-YUV
+           functionality in TurboJPEG, since we want to output the U and V
+           planes in their subsampled form.  Thus, we have to override some
+           internal libjpeg parameters to force it to use the "scaled" IDCT
+           functions on the U and V planes. */
+        compptr->_DCT_scaled_size = dctsize;
+        compptr->MCU_sample_width = tjMCUWidth[jpegSubsamp] *
+          sf[sfi].num / sf[sfi].denom *
+          compptr->v_samp_factor / dinfo->max_v_samp_factor;
+        dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
+      }
+      crow[i] = row * compptr->v_samp_factor / dinfo->max_v_samp_factor;
+      if (usetmpbuf) yuvptr[i] = tmpbuf[i];
+      else yuvptr[i] = &outbuf[i][crow[i]];
+    }
+    jpeg_read_raw_data(dinfo, yuvptr,
+                       dinfo->max_v_samp_factor * dinfo->_min_DCT_scaled_size);
     if (usetmpbuf) {
-			int j;
+      int j;
 
       for (i = 0; i < dinfo->num_components; i++) {
         for (j = 0; j < MIN(th[i], ph[i] - crow[i]); j++) {
-					memcpy(outbuf[i][crow[i]+j], tmpbuf[i][j], pw[i]);
-				}
-			}
-		}
-	}
-	jpeg_finish_decompress(dinfo);
+          memcpy(outbuf[i][crow[i] + j], tmpbuf[i][j], pw[i]);
+        }
+      }
+    }
+  }
+  jpeg_finish_decompress(dinfo);
 
-	bailout:
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-		if(tmpbuf[i]) free(tmpbuf[i]);
-		if(outbuf[i]) free(outbuf[i]);
-	}
-	if(_tmpbuf) free(_tmpbuf);
-	if(this->jerr.warning) retval=-1;
+    free(tmpbuf[i]);
+    free(outbuf[i]);
+  }
+  free(_tmpbuf);
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
                                  unsigned long jpegSize, unsigned char *dstBuf,
-	int width, int pad, int height, int flags)
+                                 int width, int pad, int height, int flags)
 {
-	unsigned char *dstPlanes[3];
-	int pw0, ph0, strides[3], retval=-1, jpegSubsamp=-1;
-	int i, jpegwidth, jpegheight, scaledw, scaledh;
+  unsigned char *dstPlanes[3];
+  int pw0, ph0, strides[3], retval = -1, jpegSubsamp = -1;
+  int i, jpegwidth, jpegheight, scaledw, scaledh;
 
-	getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
-      pad < 1 || !isPow2(pad) || height < 0)
-		_throw("tjDecompressToYUV2(): Invalid argument");
+      pad < 1 || !IS_POW2(pad) || height < 0)
+    THROW("tjDecompressToYUV2(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		return -1;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    return -1;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
-	jpegSubsamp=getSubsamp(dinfo);
-	if(jpegSubsamp<0)
-		_throw("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  jpegSubsamp = getSubsamp(dinfo);
+  if (jpegSubsamp < 0)
+    THROW("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
 
-	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
-	if(width==0) width=jpegwidth;
-	if(height==0) height=jpegheight;
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
 
   for (i = 0; i < NUMSF; i++) {
-		scaledw=TJSCALED(jpegwidth, sf[i]);
-		scaledh=TJSCALED(jpegheight, sf[i]);
-		if(scaledw<=width && scaledh<=height)
-			break;
-	}
-	if(i>=NUMSF)
-		_throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
 
-	pw0=tjPlaneWidth(0, width, jpegSubsamp);
-	ph0=tjPlaneHeight(0, height, jpegSubsamp);
-	dstPlanes[0]=dstBuf;
-	strides[0]=PAD(pw0, pad);
+  pw0 = tjPlaneWidth(0, width, jpegSubsamp);
+  ph0 = tjPlaneHeight(0, height, jpegSubsamp);
+  dstPlanes[0] = dstBuf;
+  strides[0] = PAD(pw0, pad);
   if (jpegSubsamp == TJSAMP_GRAY) {
-		strides[1]=strides[2]=0;
-		dstPlanes[1]=dstPlanes[2]=NULL;
+    strides[1] = strides[2] = 0;
+    dstPlanes[1] = dstPlanes[2] = NULL;
   } else {
-		int pw1=tjPlaneWidth(1, width, jpegSubsamp);
-		int ph1=tjPlaneHeight(1, height, jpegSubsamp);
+    int pw1 = tjPlaneWidth(1, width, jpegSubsamp);
+    int ph1 = tjPlaneHeight(1, height, jpegSubsamp);
 
-		strides[1]=strides[2]=PAD(pw1, pad);
-		dstPlanes[1]=dstPlanes[0]+strides[0]*ph0;
-		dstPlanes[2]=dstPlanes[1]+strides[1]*ph1;
-	}
+    strides[1] = strides[2] = PAD(pw1, pad);
+    dstPlanes[1] = dstPlanes[0] + strides[0] * ph0;
+    dstPlanes[2] = dstPlanes[1] + strides[1] * ph1;
+  }
 
-	this->headerRead=1;
-	return tjDecompressToYUVPlanes(handle, jpegBuf, jpegSize, dstPlanes, width,
-		strides, height, flags);
+  this->headerRead = 1;
+  return tjDecompressToYUVPlanes(handle, jpegBuf, jpegSize, dstPlanes, width,
+                                 strides, height, flags);
 
-	bailout:
+bailout:
   this->jerr.stopOnWarning = FALSE;
-	return retval;
+  return retval;
 }
 
 DLLEXPORT int tjDecompressToYUV(tjhandle handle, unsigned char *jpegBuf,
                                 unsigned long jpegSize, unsigned char *dstBuf,
-	int flags)
+                                int flags)
 {
-	return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
+  return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
 }
 
 
@@ -1799,16 +1820,16 @@ DLLEXPORT tjhandle tjInitTransform(void)
   tjhandle handle = NULL;
 
   if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL) {
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjInitTransform(): Memory allocation failure");
-		return NULL;
-	}
-	MEMZERO(this, sizeof(tjinstance));
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjInitTransform(): Memory allocation failure");
+    return NULL;
+  }
+  MEMZERO(this, sizeof(tjinstance));
   snprintf(this->errStr, JMSG_LENGTH_MAX, "No error");
-	handle=_tjInitCompress(this);
-	if(!handle) return NULL;
-	handle=_tjInitDecompress(this);
-	return handle;
+  handle = _tjInitCompress(this);
+  if (!handle) return NULL;
+  handle = _tjInitDecompress(this);
+  return handle;
 }
 
 
@@ -1817,104 +1838,105 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
                           unsigned char **dstBufs, unsigned long *dstSizes,
                           tjtransform *t, int flags)
 {
-	jpeg_transform_info *xinfo=NULL;
-	jvirt_barray_ptr *srccoefs, *dstcoefs;
+  jpeg_transform_info *xinfo = NULL;
+  jvirt_barray_ptr *srccoefs, *dstcoefs;
   int retval = 0, i, jpegSubsamp, saveMarkers = 0;
 
-	getinstance(handle);
+  GET_INSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
-	if((this->init&COMPRESS)==0 || (this->init&DECOMPRESS)==0)
-		_throw("tjTransform(): Instance has not been initialized for transformation");
+  if ((this->init & COMPRESS) == 0 || (this->init & DECOMPRESS) == 0)
+    THROW("tjTransform(): Instance has not been initialized for transformation");
 
   if (jpegBuf == NULL || jpegSize <= 0 || n < 1 || dstBufs == NULL ||
       dstSizes == NULL || t == NULL || flags < 0)
-		_throw("tjTransform(): Invalid argument");
+    THROW("tjTransform(): Invalid argument");
 
 #ifndef NO_PUTENV
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
   if ((xinfo =
        (jpeg_transform_info *)malloc(sizeof(jpeg_transform_info) * n)) == NULL)
-		_throw("tjTransform(): Memory allocation failure");
-	MEMZERO(xinfo, sizeof(jpeg_transform_info)*n);
+    THROW("tjTransform(): Memory allocation failure");
+  MEMZERO(xinfo, sizeof(jpeg_transform_info) * n);
 
   if (setjmp(this->jerr.setjmp_buffer)) {
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
 
   for (i = 0; i < n; i++) {
-		xinfo[i].transform=xformtypes[t[i].op];
-		xinfo[i].perfect=(t[i].options&TJXOPT_PERFECT)? 1:0;
-		xinfo[i].trim=(t[i].options&TJXOPT_TRIM)? 1:0;
-		xinfo[i].force_grayscale=(t[i].options&TJXOPT_GRAY)? 1:0;
-		xinfo[i].crop=(t[i].options&TJXOPT_CROP)? 1:0;
-		if(n!=1 && t[i].op==TJXOP_HFLIP) xinfo[i].slow_hflip=1;
-		else xinfo[i].slow_hflip=0;
+    xinfo[i].transform = xformtypes[t[i].op];
+    xinfo[i].perfect = (t[i].options & TJXOPT_PERFECT) ? 1 : 0;
+    xinfo[i].trim = (t[i].options & TJXOPT_TRIM) ? 1 : 0;
+    xinfo[i].force_grayscale = (t[i].options & TJXOPT_GRAY) ? 1 : 0;
+    xinfo[i].crop = (t[i].options & TJXOPT_CROP) ? 1 : 0;
+    if (n != 1 && t[i].op == TJXOP_HFLIP) xinfo[i].slow_hflip = 1;
+    else xinfo[i].slow_hflip = 0;
 
     if (xinfo[i].crop) {
-			xinfo[i].crop_xoffset=t[i].r.x;  xinfo[i].crop_xoffset_set=JCROP_POS;
-			xinfo[i].crop_yoffset=t[i].r.y;  xinfo[i].crop_yoffset_set=JCROP_POS;
+      xinfo[i].crop_xoffset = t[i].r.x;  xinfo[i].crop_xoffset_set = JCROP_POS;
+      xinfo[i].crop_yoffset = t[i].r.y;  xinfo[i].crop_yoffset_set = JCROP_POS;
       if (t[i].r.w != 0) {
-				xinfo[i].crop_width=t[i].r.w;  xinfo[i].crop_width_set=JCROP_POS;
+        xinfo[i].crop_width = t[i].r.w;  xinfo[i].crop_width_set = JCROP_POS;
       } else
         xinfo[i].crop_width = JCROP_UNSET;
       if (t[i].r.h != 0) {
-				xinfo[i].crop_height=t[i].r.h;  xinfo[i].crop_height_set=JCROP_POS;
+        xinfo[i].crop_height = t[i].r.h;  xinfo[i].crop_height_set = JCROP_POS;
       } else
         xinfo[i].crop_height = JCROP_UNSET;
-			}
+    }
     if (!(t[i].options & TJXOPT_COPYNONE)) saveMarkers = 1;
-	}
+  }
 
   jcopy_markers_setup(dinfo, saveMarkers ? JCOPYOPT_ALL : JCOPYOPT_NONE);
-	jpeg_read_header(dinfo, TRUE);
-	jpegSubsamp=getSubsamp(dinfo);
-	if(jpegSubsamp<0)
-		_throw("tjTransform(): Could not determine subsampling type for JPEG image");
+  jpeg_read_header(dinfo, TRUE);
+  jpegSubsamp = getSubsamp(dinfo);
+  if (jpegSubsamp < 0)
+    THROW("tjTransform(): Could not determine subsampling type for JPEG image");
 
   for (i = 0; i < n; i++) {
-		if(!jtransform_request_workspace(dinfo, &xinfo[i]))
-			_throw("tjTransform(): Transform is not perfect");
+    if (!jtransform_request_workspace(dinfo, &xinfo[i]))
+      THROW("tjTransform(): Transform is not perfect");
 
     if (xinfo[i].crop) {
       if ((t[i].r.x % xinfo[i].iMCU_sample_width) != 0 ||
           (t[i].r.y % xinfo[i].iMCU_sample_height) != 0) {
-				snprintf(errStr, JMSG_LENGTH_MAX,
-					"To crop this JPEG image, x must be a multiple of %d\n"
-					"and y must be a multiple of %d.\n",
-					xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
-				retval=-1;  goto bailout;
-			}
-		}
-	}
+        snprintf(this->errStr, JMSG_LENGTH_MAX,
+                 "To crop this JPEG image, x must be a multiple of %d\n"
+                 "and y must be a multiple of %d.\n",
+                 xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
+        this->isInstanceError = TRUE;
+        retval = -1;  goto bailout;
+      }
+    }
+  }
 
-	srccoefs=jpeg_read_coefficients(dinfo);
+  srccoefs = jpeg_read_coefficients(dinfo);
 
   for (i = 0; i < n; i++) {
-		int w, h, alloc=1;
+    int w, h, alloc = 1;
 
     if (!xinfo[i].crop) {
-			w=dinfo->image_width;  h=dinfo->image_height;
+      w = dinfo->image_width;  h = dinfo->image_height;
     } else {
-			w=xinfo[i].crop_width;  h=xinfo[i].crop_height;
-		}
+      w = xinfo[i].crop_width;  h = xinfo[i].crop_height;
+    }
     if (flags & TJFLAG_NOREALLOC) {
-			alloc=0;  dstSizes[i]=tjBufSize(w, h, jpegSubsamp);
-		}
-		if(!(t[i].options&TJXOPT_NOOUTPUT))
-			jpeg_mem_dest_tj(cinfo, &dstBufs[i], &dstSizes[i], alloc);
-		jpeg_copy_critical_parameters(dinfo, cinfo);
+      alloc = 0;  dstSizes[i] = tjBufSize(w, h, jpegSubsamp);
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT))
+      jpeg_mem_dest_tj(cinfo, &dstBufs[i], &dstSizes[i], alloc);
+    jpeg_copy_critical_parameters(dinfo, cinfo);
     dstcoefs = jtransform_adjust_parameters(dinfo, cinfo, srccoefs, &xinfo[i]);
     if (flags & TJFLAG_PROGRESSIVE || t[i].options & TJXOPT_PROGRESSIVE)
-				jpeg_simple_progression(cinfo);
+      jpeg_simple_progression(cinfo);
     if (!(t[i].options & TJXOPT_NOOUTPUT)) {
-			jpeg_write_coefficients(cinfo, dstcoefs);
+      jpeg_write_coefficients(cinfo, dstcoefs);
       jcopy_markers_execute(dinfo, cinfo, t[i].options & TJXOPT_COPYNONE ?
                                           JCOPYOPT_NONE : JCOPYOPT_ALL);
     } else
@@ -1925,7 +1947,7 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
       JDIMENSION by;
 
       for (ci = 0; ci < cinfo->num_components; ci++) {
-				jpeg_component_info *compptr=&cinfo->comp_info[ci];
+        jpeg_component_info *compptr = &cinfo->comp_info[ci];
         tjregion arrayRegion = {
           0, 0, compptr->width_in_blocks * DCTSIZE, DCTSIZE
         };
@@ -1936,29 +1958,29 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
 
         for (by = 0; by < compptr->height_in_blocks;
              by += compptr->v_samp_factor) {
-					JBLOCKARRAY barray=(dinfo->mem->access_virt_barray)
-						((j_common_ptr)dinfo, dstcoefs[ci], by, compptr->v_samp_factor,
-						TRUE);
+          JBLOCKARRAY barray = (dinfo->mem->access_virt_barray)
+            ((j_common_ptr)dinfo, dstcoefs[ci], by, compptr->v_samp_factor,
+             TRUE);
 
           for (y = 0; y < compptr->v_samp_factor; y++) {
             if (t[i].customFilter(barray[y][0], arrayRegion, planeRegion, ci,
                                   i, &t[i]) == -1)
-							_throw("tjTransform(): Error in custom filter");
-						arrayRegion.y+=DCTSIZE;
-					}
-				}
-			}
-		}
-		if(!(t[i].options&TJXOPT_NOOUTPUT)) jpeg_finish_compress(cinfo);
-	}
+              THROW("tjTransform(): Error in custom filter");
+            arrayRegion.y += DCTSIZE;
+          }
+        }
+      }
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT)) jpeg_finish_compress(cinfo);
+  }
 
-	jpeg_finish_decompress(dinfo);
+  jpeg_finish_decompress(dinfo);
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	if(xinfo) free(xinfo);
-	if(this->jerr.warning) retval=-1;
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  free(xinfo);
+  if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
   return retval;
 }
@@ -1968,7 +1990,8 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
                                      int align, int *height, int *pixelFormat,
                                      int flags)
 {
-  int retval = 0, tempc, pitch;
+  int retval = 0, tempc;
+  size_t pitch;
   tjhandle handle = NULL;
   tjinstance *this;
   j_compress_ptr cinfo = NULL;
@@ -1979,21 +2002,21 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
 
   if (!filename || !width || align < 1 || !height || !pixelFormat ||
       *pixelFormat < TJPF_UNKNOWN || *pixelFormat >= TJ_NUMPF)
-    _throwg("tjLoadImage(): Invalid argument");
+    THROWG("tjLoadImage(): Invalid argument");
   if ((align & (align - 1)) != 0)
-    _throwg("tjLoadImage(): Alignment must be a power of 2");
+    THROWG("tjLoadImage(): Alignment must be a power of 2");
 
   if ((handle = tjInitCompress()) == NULL) return NULL;
   this = (tjinstance *)handle;
   cinfo = &this->cinfo;
 
   if ((file = fopen(filename, "rb")) == NULL)
-    _throwunix("tjLoadImage(): Cannot open input file");
+    THROW_UNIX("tjLoadImage(): Cannot open input file");
 
   if ((tempc = getc(file)) < 0 || ungetc(tempc, file) == EOF)
-    _throwunix("tjLoadImage(): Could not read input file")
+    THROW_UNIX("tjLoadImage(): Could not read input file")
   else if (tempc == EOF)
-    _throwg("tjLoadImage(): Input file contains no data");
+    THROWG("tjLoadImage(): Input file contains no data");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -2004,14 +2027,14 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
   else cinfo->in_color_space = pf2cs[*pixelFormat];
   if (tempc == 'B') {
     if ((src = jinit_read_bmp(cinfo, FALSE)) == NULL)
-      _throwg("tjLoadImage(): Could not initialize bitmap loader");
+      THROWG("tjLoadImage(): Could not initialize bitmap loader");
     invert = (flags & TJFLAG_BOTTOMUP) == 0;
   } else if (tempc == 'P') {
     if ((src = jinit_read_ppm(cinfo)) == NULL)
-      _throwg("tjLoadImage(): Could not initialize bitmap loader");
+      THROWG("tjLoadImage(): Could not initialize bitmap loader");
     invert = (flags & TJFLAG_BOTTOMUP) != 0;
   } else
-    _throwg("tjLoadImage(): Unsupported file type");
+    THROWG("tjLoadImage(): Unsupported file type");
 
   src->input_file = file;
   (*src->start_input) (cinfo, src);
@@ -2021,8 +2044,10 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
   *pixelFormat = cs2pf[cinfo->in_color_space];
 
   pitch = PAD((*width) * tjPixelSize[*pixelFormat], align);
-  if ((dstBuf = (unsigned char *)malloc(pitch * (*height))) == NULL)
-    _throwg("tjLoadImage(): Memory allocation failure");
+  if ((unsigned long long)pitch * (unsigned long long)(*height) >
+      (unsigned long long)((size_t)-1) ||
+      (dstBuf = (unsigned char *)malloc(pitch * (*height))) == NULL)
+    THROWG("tjLoadImage(): Memory allocation failure");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -2049,7 +2074,7 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
 bailout:
   if (handle) tjDestroy(handle);
   if (file) fclose(file);
-  if (retval < 0 && dstBuf) { free(dstBuf);  dstBuf = NULL; }
+  if (retval < 0) { free(dstBuf);  dstBuf = NULL; }
   return dstBuf;
 }
 
@@ -2069,7 +2094,7 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
 
   if (!filename || !buffer || width < 1 || pitch < 0 || height < 1 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
-    _throwg("tjSaveImage(): Invalid argument");
+    THROWG("tjSaveImage(): Invalid argument");
 
   if ((handle = tjInitDecompress()) == NULL)
     return -1;
@@ -2077,7 +2102,7 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
   dinfo = &this->dinfo;
 
   if ((file = fopen(filename, "wb")) == NULL)
-    _throwunix("tjSaveImage(): Cannot open output file");
+    THROW_UNIX("tjSaveImage(): Cannot open output file");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -2092,11 +2117,11 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
   ptr = strrchr(filename, '.');
   if (ptr && !strcasecmp(ptr, ".bmp")) {
     if ((dst = jinit_write_bmp(dinfo, FALSE, FALSE)) == NULL)
-      _throwg("tjSaveImage(): Could not initialize bitmap writer");
+      THROWG("tjSaveImage(): Could not initialize bitmap writer");
     invert = (flags & TJFLAG_BOTTOMUP) == 0;
   } else {
     if ((dst = jinit_write_ppm(dinfo)) == NULL)
-      _throwg("tjSaveImage(): Could not initialize PPM writer");
+      THROWG("tjSaveImage(): Could not initialize PPM writer");
     invert = (flags & TJFLAG_BOTTOMUP) != 0;
   }
 
@@ -2123,5 +2148,5 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
 bailout:
   if (handle) tjDestroy(handle);
   if (file) fclose(file);
-	return retval;
+  return retval;
 }
diff --git a/turbojpeg.h b/turbojpeg.h
index 9c0a3713..074f015f 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2015, 2017 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2015, 2017, 2020 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -1636,7 +1636,8 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
  * (re)allocated by the compression and transform functions or that were
  * manually allocated using #tjAlloc().
  *
- * @param buffer address of the buffer to free
+ * @param buffer address of the buffer to free.  If the address is NULL, then
+ * this function has no effect.
  *
  * @sa tjAlloc()
  */
diff --git a/wrbmp.c b/wrbmp.c
index 38a64e87..239f64eb 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, 2017, D. R. Commander.
+ * Copyright (C) 2014-2015, 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -303,9 +303,7 @@ write_os2_header(j_decompress_ptr cinfo, bmp_dest_ptr dest)
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
-  if (cinfo->out_color_space == JCS_RGB ||
-      (cinfo->out_color_space >= JCS_EXT_RGB &&
-       cinfo->out_color_space <= JCS_EXT_ARGB)) {
+  if (IsExtRGB(cinfo->out_color_space)) {
     if (cinfo->quantize_colors) {
       /* Colormapped RGB */
       bits_per_pixel = 8;
@@ -499,15 +497,14 @@ jinit_write_bmp(j_decompress_ptr cinfo, boolean is_os2,
 
   if (cinfo->out_color_space == JCS_GRAYSCALE) {
     dest->pub.put_pixel_rows = put_gray_rows;
-  } else if (cinfo->out_color_space == JCS_RGB ||
-             (cinfo->out_color_space >= JCS_EXT_RGB &&
-              cinfo->out_color_space <= JCS_EXT_ARGB)) {
+  } else if (IsExtRGB(cinfo->out_color_space)) {
     if (cinfo->quantize_colors)
       dest->pub.put_pixel_rows = put_gray_rows;
     else
       dest->pub.put_pixel_rows = put_pixel_rows;
-  } else if (cinfo->out_color_space == JCS_RGB565 ||
-             cinfo->out_color_space == JCS_CMYK) {
+  } else if (!cinfo->quantize_colors &&
+             (cinfo->out_color_space == JCS_RGB565 ||
+              cinfo->out_color_space == JCS_CMYK)) {
     dest->pub.put_pixel_rows = put_pixel_rows;
   } else {
     ERREXIT(cinfo, JERR_BMP_COLORSPACE);
diff --git a/wrjpgcom.c b/wrjpgcom.c
index faf77b41..8a4e7416 100644
--- a/wrjpgcom.c
+++ b/wrjpgcom.c
@@ -580,7 +580,7 @@ main(int argc, char **argv)
     }
   }
   /* Duplicate the remainder of the source file.
-   * Note that any COM markers occuring after SOF will not be touched.
+   * Note that any COM markers occurring after SOF will not be touched.
    */
   write_marker(marker);
   copy_rest_of_file();
diff --git a/wrppm.c b/wrppm.c
index 819a0a7e..69f91e81 100644
--- a/wrppm.c
+++ b/wrppm.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * Modified 2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -256,6 +256,8 @@ start_output_ppm(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
   case JCS_EXT_ABGR:
   case JCS_EXT_ARGB:
   case JCS_CMYK:
+    if (!IsExtRGB(cinfo->out_color_space) && cinfo->quantize_colors)
+      ERREXIT(cinfo, JERR_PPM_COLORSPACE);
     /* emit header for raw PPM format */
     fprintf(dest->pub.output_file, "P6\n%ld %ld\n%d\n",
             (long)cinfo->output_width, (long)cinfo->output_height, PPM_MAXVAL);
@@ -337,13 +339,14 @@ jinit_write_ppm(j_decompress_ptr cinfo)
       ((j_common_ptr)cinfo, JPOOL_IMAGE,
        cinfo->output_width * cinfo->output_components, (JDIMENSION)1);
     dest->pub.buffer_height = 1;
-    if (IsExtRGB(cinfo->out_color_space))
-      dest->pub.put_pixel_rows = put_rgb;
-    else if (cinfo->out_color_space == JCS_CMYK)
-      dest->pub.put_pixel_rows = put_cmyk;
-    else if (!cinfo->quantize_colors)
-      dest->pub.put_pixel_rows = copy_pixel_rows;
-    else if (cinfo->out_color_space == JCS_GRAYSCALE)
+    if (!cinfo->quantize_colors) {
+      if (IsExtRGB(cinfo->out_color_space))
+        dest->pub.put_pixel_rows = put_rgb;
+      else if (cinfo->out_color_space == JCS_CMYK)
+        dest->pub.put_pixel_rows = put_cmyk;
+      else
+        dest->pub.put_pixel_rows = copy_pixel_rows;
+    } else if (cinfo->out_color_space == JCS_GRAYSCALE)
       dest->pub.put_pixel_rows = put_demapped_gray;
     else
       dest->pub.put_pixel_rows = put_demapped_rgb;