diff --git a/.gitauthors b/.gitauthors
new file mode 100644
index 00000000..7b724b5e
--- /dev/null
+++ b/.gitauthors
@@ -0,0 +1,7 @@
+
+dcommander = DRC <dcommander@users.sourceforge.net>
+astrand = Peter Åstrand <astrand@cendio.se>
+ossman_ = Pierre Ossman <ossman@cendio.se>
+const_k = Constantin Kaplinsky <const@tightvnc.com>
+atkac = Adam Tkac <atkac@redhat.com>
+
diff --git a/BUILDING.txt b/BUILDING.txt
index 70d71240..b94e96f8 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -99,7 +99,7 @@ This will generate the following files under .libs/
   cygjpeg-{version}.dll (Cygwin)
       Shared library for the libjpeg API
 
-  By default, {version} is 62.1.0, 7.1.0, or 8.0.2, depending on whether
+  By default, {version} is 62.2.0, 7.2.0, or 8.1.2, depending on whether
   libjpeg v6b (default), v7, or v8 emulation is enabled.  If using Cygwin,
   {version} is 62, 7, or 8.
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0832421..709b90c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,12 +180,12 @@ endif()
 #
 
 set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
-  jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c
-  jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c
-  jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c
-  jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c
-  jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c
-  jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
+  jccompat.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
+  jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
+  jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c
+  jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c
+  jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c
+  jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
 
 if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
   set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index b158dc4d..ae88b498 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,5 +1,21 @@
-1.4 pre-beta
-============
+1.4.0
+=====
+
+[1] Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the le32toh() and htole32() functions.)
+
+[2] The non-SIMD RGB565 color conversion code did not work correctly on big
+endian machines.  This has been fixed.
+
+[3] Fixed an issue in tjPlaneSizeYUV() whereby it would erroneously return 1
+instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
+
+[4] Fixed an issue in tjBufSizeYUV2() whereby it would erroneously return 0
+instead of -1 if width was < 1.
+
+
+1.3.90 (1.4 beta1)
+==================
 
 [1] New features in the TurboJPEG API:
 -- YUV planar images can now be generated with an arbitrary line padding
diff --git a/Makefile.am b/Makefile.am
index cc5435e2..1b47317c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,13 +16,13 @@ HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
 	jpeg_nbits_table.h
 
 libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
-	jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
-	jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
-	jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
-	jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
-	jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \
-	jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
-	jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
+	jccompat.c jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c \
+	jcmaster.c jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c \
+	jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c \
+	jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c \
+	jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c \
+	jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c \
+	jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
 
 if WITH_ARITH
 libjpeg_la_SOURCES += jaricom.c
diff --git a/cjpeg.c b/cjpeg.c
index a56a1ecd..098dbc32 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -188,6 +188,7 @@ usage (void)
   fprintf(stderr, "  -tune-ssim     Tune trellis optimization for SSIM\n");
   fprintf(stderr, "  -tune-ms-ssim  Tune trellis optimization for MS-SSIM\n");
   fprintf(stderr, "Switches for advanced users:\n");
+  fprintf(stderr, "  -noovershoot   Disable black-on-white deringing via overshoot\n");
 #ifdef C_ARITH_CODING_SUPPORTED
   fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
 #endif
@@ -303,9 +304,10 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
         cinfo->dct_method = JDCT_IFAST;
       } else if (keymatch(argv[argn], "float", 2)) {
         cinfo->dct_method = JDCT_FLOAT;
-      } else
+      } else {
         fprintf(stderr, "%s: invalid argument for dct\n", progname);
         usage();
+      }
 
     } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
       /* Enable debug printouts. */
@@ -323,10 +325,10 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       cinfo->err->trace_level++;
 
     } else if (keymatch(arg, "fastcrush", 4)) {
-      cinfo->optimize_scans = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
 
     } else if (keymatch(arg, "flat", 4)) {
-      cinfo->use_flat_quant_tbl = TRUE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL, TRUE);
       jpeg_set_quality(cinfo, 75, TRUE);
 
     } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
@@ -340,12 +342,14 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
     } else if (keymatch(arg, "lambda1", 7)) {
       if (++argn >= argc)	/* advance to next argument */
 	usage();
-      cinfo->lambda_log_scale1 = atof(argv[argn]);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1,
+                             atof(argv[argn]));
 
     } else if (keymatch(arg, "lambda2", 7)) {
       if (++argn >= argc)	/* advance to next argument */
 	usage();
-      cinfo->lambda_log_scale2 = atof(argv[argn]);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2,
+                             atof(argv[argn]));
 
     } else if (keymatch(arg, "maxmemory", 3)) {
       /* Maximum memory in Kb (or Mb with 'm'). */
@@ -361,7 +365,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
     } else if (keymatch(arg, "opt-dc-scan", 6)) {
-      cinfo->one_dc_scan = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_ONE_DC_SCAN, FALSE);
 
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
       /* Enable entropy parm optimization. */
@@ -448,7 +452,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "revert", 3)) {
       /* revert to old JPEG default */
-      cinfo->use_moz_defaults = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, FALSE);
       jpeg_set_defaults(cinfo);
 
     } else if (keymatch(arg, "sample", 2)) {
@@ -487,8 +491,8 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       cinfo->smoothing_factor = val;
 
     } else if (keymatch(arg, "split-dc-scans", 3)) {
-      cinfo->one_dc_scan = FALSE;
-      cinfo->sep_dc_scan = TRUE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_ONE_DC_SCAN, FALSE);
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_SEP_DC_SCAN, TRUE);
       
     } else if (keymatch(arg, "targa", 1)) {
       /* Input file is Targa format. */
@@ -496,44 +500,46 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "notrellis-dc", 11)) {
       /* disable trellis quantization */
-      cinfo->trellis_quant_dc = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT_DC, FALSE);
       
     } else if (keymatch(arg, "notrellis", 1)) {
       /* disable trellis quantization */
-      cinfo->trellis_quant = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT, FALSE);
       
     } else if (keymatch(arg, "trellis-dc", 9)) {
       /* enable DC trellis quantization */
-      cinfo->trellis_quant_dc = TRUE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT_DC, TRUE);
       
     } else if (keymatch(arg, "tune-psnr", 6)) {
-      cinfo->use_flat_quant_tbl = TRUE;
-      cinfo->lambda_log_scale1 = 9.0;
-      cinfo->lambda_log_scale2 = 0.0;
-      cinfo->use_lambda_weight_tbl = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL, TRUE);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1, 9.0);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2, 0.0);
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_LAMBDA_WEIGHT_TBL, FALSE);
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ssim", 6)) {
-      cinfo->use_flat_quant_tbl = TRUE;
-      cinfo->lambda_log_scale1 = 12.0;
-      cinfo->lambda_log_scale2 = 13.5;
-      cinfo->use_lambda_weight_tbl = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL, TRUE);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1, 12.0);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2, 13.5);
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_LAMBDA_WEIGHT_TBL, FALSE);
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ms-ssim", 6)) {
-      cinfo->use_flat_quant_tbl = FALSE;
-      cinfo->lambda_log_scale1 = 14.25;
-      cinfo->lambda_log_scale2 = 12.75;
-      cinfo->use_lambda_weight_tbl = TRUE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL, FALSE);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1, 14.25);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2, 12.75);
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_LAMBDA_WEIGHT_TBL, TRUE);
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-hvs-psnr", 6)) {
-      cinfo->use_flat_quant_tbl = FALSE;
-      cinfo->lambda_log_scale1 = 16.0;
-      cinfo->lambda_log_scale2 = 15.5;
-      cinfo->use_lambda_weight_tbl = TRUE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL, FALSE);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1, 16.0);
+      jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2, 15.5);
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_LAMBDA_WEIGHT_TBL, TRUE);
       jpeg_set_quality(cinfo, 75, TRUE);
-      
+
+    } else if (keymatch(arg, "noovershoot", 11)) {
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OVERSHOOT_DERINGING, FALSE);
     } else {
       fprintf(stderr, "%s: unknown option '%s'\n", progname, arg);
       usage();                  /* bogus switch */
@@ -628,7 +634,8 @@ main (int argc, char **argv)
    */
 
   cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
-  cinfo.use_moz_defaults = TRUE;
+  if (jpeg_c_bool_param_supported(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
+    jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
   jpeg_set_defaults(&cinfo);
 
   /* Scan command line to find file names.
diff --git a/configure.ac b/configure.ac
index 12387c6a..efbbdb86 100644
--- a/configure.ac
+++ b/configure.ac
@@ -189,7 +189,7 @@ fi
 RPM_CONFIG_ARGS=
 
 # Memory source/destination managers
-SO_AGE=0
+SO_AGE=1
 MEM_SRCDST_FUNCTIONS=
 if test "x${with_jpeg8}" != "xyes"; then
   AC_MSG_CHECKING([whether to include in-memory source/destination managers])
@@ -200,7 +200,7 @@ if test "x${with_jpeg8}" != "xyes"; then
     AC_MSG_RESULT(yes)
     AC_DEFINE([MEM_SRCDST_SUPPORTED], [1],
       [Support in-memory source/destination managers])
-    SO_AGE=1
+    SO_AGE=2
     MEM_SRCDST_FUNCTIONS="global:  jpeg_mem_dest;  jpeg_mem_src;";
   else
     AC_MSG_RESULT(no)
@@ -487,6 +487,10 @@ if test "x${with_simd}" != "xno"; then
         fi
       fi
       ;;
+    powerpc*)
+      AC_MSG_RESULT([yes (powerpc)])
+      simd_arch=powerpc
+      ;;
     *)
       AC_MSG_RESULT([no ("$host_cpu")])
       with_simd=no;
@@ -512,6 +516,7 @@ AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
 AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
 AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"])
 AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"])
+AM_CONDITIONAL([SIMD_POWERPC], [test "x$simd_arch" = "xpowerpc"])
 AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
 AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
 
diff --git a/example.c b/example.c
index 0b9574ef..72939acf 100644
--- a/example.c
+++ b/example.c
@@ -126,7 +126,7 @@ write_JPEG_file (char * filename, int quality)
   cinfo.image_height = image_height;
   cinfo.input_components = 3;           /* # of color components per pixel */
   cinfo.in_color_space = JCS_RGB;       /* colorspace of input image */
-  cinfo.use_moz_defaults = TRUE;        /* use Mozilla defaults for improved compression */
+  jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE); /* use Mozilla defaults for improved compression */
   
   /* Now use the library's routine to set default compression parameters.
    * (You must set at least cinfo.in_color_space before calling this,
diff --git a/git-init-svn.sh b/git-init-svn.sh
new file mode 100755
index 00000000..786d775e
--- /dev/null
+++ b/git-init-svn.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+set -e
+
+# Make a local, clean libjpeg-turbo branch that tracks the remote libjpeg-turbo.
+# This will allow pushing of imported libjpeg-turbo commits to the mozjpeg repository.
+# The libjpeg-turbo branch must only contain imported SVN commits (with git-svn-id: in the message).
+git branch -f -t libjpeg-turbo origin/libjpeg-turbo
+
+# Configure git-svn. "git svn fetch" will rebuild remaining git-svn metadata.
+git config svn-remote.svn.url svn://svn.code.sf.net/p/libjpeg-turbo/code
+git config svn-remote.svn.fetch trunk:refs/heads/libjpeg-turbo
+
+# Enable mapping of SVN usernames to git authors.
+git config svn.authorsfile .gitauthors
+
+# Mark which libjpeg-turbo commit has been used to start mozjpeg.
+# Required for accurate merging and blame.  Note: this replaces any existing .git/info/grafts.
+echo > .git/info/grafts "72b66f9c77b3e4ae363b21e48145f635cec0b193 540789427ccae8e9e778151cbc16ab8ee88ac6a8"
+
+# To get changes from SVN:
+# git svn fetch
+# git push origin libjpeg-turbo
+#
+# To merge SVN changes with mozjpeg:
+# git checkout master
+# git merge libjpeg-turbo
diff --git a/jcapimin.c b/jcapimin.c
index 3b005d3f..b16f9e40 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -4,8 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains application interface code for the compression half
@@ -22,6 +22,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jmemsys.h"
 
 
 /*
@@ -91,6 +92,13 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
 
   /* OK, I'm ready */
   cinfo->global_state = CSTATE_START;
+
+  /* The master struct is used to store extension parameters, so we allocate it
+   * here.  It is later reallocated by jinit_c_master_control().
+   */
+  cinfo->master = (struct jpeg_comp_master *)
+      jpeg_get_small ((j_common_ptr) cinfo, sizeof(struct jpeg_comp_master));
+  MEMZERO(cinfo->master, sizeof(struct jpeg_comp_master));
 }
 
 
diff --git a/jcapistd.c b/jcapistd.c
index 7c7bc1c3..2cc6d325 100644
--- a/jcapistd.c
+++ b/jcapistd.c
@@ -46,8 +46,9 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
     jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */
 
   /* setting up scan optimisation pattern failed, disable scan optimisation */
-  if (cinfo->num_scans_luma == 0 || cinfo->scan_info == NULL || cinfo->num_scans == 0)
-    cinfo->optimize_scans = FALSE;
+  if (cinfo->master->num_scans_luma == 0 || cinfo->scan_info == NULL ||
+      cinfo->num_scans == 0)
+    cinfo->master->optimize_scans = FALSE;
   
   /* (Re)initialize error mgr and destination modules */
   (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
diff --git a/jccoefct.c b/jccoefct.c
index 5d621508..ad8e4e36 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -402,7 +402,12 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      */
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
-      quantize_trellis(cinfo, dctbl, actbl, thisblockrow, buffer_dst[block_row], blocks_across, cinfo->quant_tbl_ptrs[compptr->quant_tbl_no], cinfo->norm_src[compptr->quant_tbl_no], cinfo->norm_coef[compptr->quant_tbl_no], &lastDC);
+      quantize_trellis(cinfo, dctbl, actbl, thisblockrow,
+                       buffer_dst[block_row], blocks_across,
+                       cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
+                       cinfo->master->norm_src[compptr->quant_tbl_no],
+                       cinfo->master->norm_coef[compptr->quant_tbl_no],
+                       &lastDC);
       
       if (ndummy > 0) {
 	/* Create dummy blocks at the right edge of the image. */
diff --git a/jccompat.c b/jccompat.c
new file mode 100644
index 00000000..cce130ef
--- /dev/null
+++ b/jccompat.c
@@ -0,0 +1,217 @@
+/*
+ * jccompat.c
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, Mozilla Corporation.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains accessor functions for extension parameters.  These
+ * allow for extending the functionality of the libjpeg API without breaking
+ * backward ABI compatibility.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+GLOBAL(boolean)  /* TRUE if `param` is one of the boolean extension parameters listed below */
+jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)  /* cinfo is not read */
+{
+  switch (param) {
+  case JBOOLEAN_USE_MOZ_DEFAULTS:
+  case JBOOLEAN_OPTIMIZE_SCANS:
+  case JBOOLEAN_ONE_DC_SCAN:
+  case JBOOLEAN_SEP_DC_SCAN:
+  case JBOOLEAN_TRELLIS_QUANT:
+  case JBOOLEAN_TRELLIS_QUANT_DC:
+  case JBOOLEAN_TRELLIS_EOB_OPT:
+  case JBOOLEAN_USE_FLAT_QUANT_TBL:
+  case JBOOLEAN_USE_LAMBDA_WEIGHT_TBL:
+  case JBOOLEAN_USE_SCANS_IN_TRELLIS:
+  case JBOOLEAN_TRELLIS_PASSES:
+  case JBOOLEAN_TRELLIS_Q_OPT:
+  case JBOOLEAN_OVERSHOOT_DERINGING:
+    return TRUE;  /* all labels above fall through to here */
+  }
+
+  return FALSE;  /* any other value is not a known boolean parameter */
+}
+
+
+GLOBAL(void)  /* Store `value` in the cinfo->master field selected by `param` */
+jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param,
+                       boolean value)
+{
+  switch(param) {  /* each case writes exactly one field of cinfo->master */
+  case JBOOLEAN_USE_MOZ_DEFAULTS:
+    cinfo->master->use_moz_defaults = value;
+    break;
+  case JBOOLEAN_OPTIMIZE_SCANS:
+    cinfo->master->optimize_scans = value;
+    break;
+  case JBOOLEAN_ONE_DC_SCAN:
+    cinfo->master->one_dc_scan = value;
+    break;
+  case JBOOLEAN_SEP_DC_SCAN:
+    cinfo->master->sep_dc_scan = value;
+    break;
+  case JBOOLEAN_TRELLIS_QUANT:
+    cinfo->master->trellis_quant = value;
+    break;
+  case JBOOLEAN_TRELLIS_QUANT_DC:
+    cinfo->master->trellis_quant_dc = value;
+    break;
+  case JBOOLEAN_TRELLIS_EOB_OPT:
+    cinfo->master->trellis_eob_opt = value;
+    break;
+  case JBOOLEAN_USE_FLAT_QUANT_TBL:
+    cinfo->master->use_flat_quant_tbl = value;
+    break;
+  case JBOOLEAN_USE_LAMBDA_WEIGHT_TBL:
+    cinfo->master->use_lambda_weight_tbl = value;
+    break;
+  case JBOOLEAN_USE_SCANS_IN_TRELLIS:
+    cinfo->master->use_scans_in_trellis = value;
+    break;
+  case JBOOLEAN_TRELLIS_PASSES:
+    cinfo->master->trellis_passes = value;
+    break;
+  case JBOOLEAN_TRELLIS_Q_OPT:
+    cinfo->master->trellis_q_opt = value;
+    break;
+  case JBOOLEAN_OVERSHOOT_DERINGING:
+    cinfo->master->overshoot_deringing = value;
+    break;
+  default:  /* unrecognized parameter: raise JERR_BAD_PARAM via ERREXIT */
+    ERREXIT(cinfo, JERR_BAD_PARAM);
+  }
+}
+
+
+GLOBAL(boolean)  /* Return the cinfo->master field selected by `param` */
+jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
+{
+  switch(param) {  /* each case reads exactly one field of cinfo->master */
+  case JBOOLEAN_USE_MOZ_DEFAULTS:
+    return cinfo->master->use_moz_defaults;
+  case JBOOLEAN_OPTIMIZE_SCANS:
+    return cinfo->master->optimize_scans;
+  case JBOOLEAN_ONE_DC_SCAN:
+    return cinfo->master->one_dc_scan;
+  case JBOOLEAN_SEP_DC_SCAN:
+    return cinfo->master->sep_dc_scan;
+  case JBOOLEAN_TRELLIS_QUANT:
+    return cinfo->master->trellis_quant;
+  case JBOOLEAN_TRELLIS_QUANT_DC:
+    return cinfo->master->trellis_quant_dc;
+  case JBOOLEAN_TRELLIS_EOB_OPT:
+    return cinfo->master->trellis_eob_opt;
+  case JBOOLEAN_USE_FLAT_QUANT_TBL:
+    return cinfo->master->use_flat_quant_tbl;
+  case JBOOLEAN_USE_LAMBDA_WEIGHT_TBL:
+    return cinfo->master->use_lambda_weight_tbl;
+  case JBOOLEAN_USE_SCANS_IN_TRELLIS:
+    return cinfo->master->use_scans_in_trellis;
+  case JBOOLEAN_TRELLIS_PASSES:
+    return cinfo->master->trellis_passes;
+  case JBOOLEAN_TRELLIS_Q_OPT:
+    return cinfo->master->trellis_q_opt;
+  case JBOOLEAN_OVERSHOOT_DERINGING:
+    return cinfo->master->overshoot_deringing;
+  default:  /* unrecognized parameter: raise JERR_BAD_PARAM via ERREXIT */
+    ERREXIT(cinfo, JERR_BAD_PARAM);
+  }
+
+  return FALSE;  /* reached only if ERREXIT returns; gives the function a defined result */
+}
+
+
+GLOBAL(boolean)  /* TRUE if `param` is one of the float extension parameters listed below */
+jpeg_c_float_param_supported (j_compress_ptr cinfo, J_FLOAT_PARAM param)  /* cinfo is not read */
+{
+  switch (param) {
+  case JFLOAT_LAMBDA_LOG_SCALE1:
+  case JFLOAT_LAMBDA_LOG_SCALE2:
+    return TRUE;  /* both labels above fall through to here */
+  }
+
+  return FALSE;  /* any other value is not a known float parameter */
+}
+
+
+GLOBAL(void)  /* Store `value` in the cinfo->master field selected by `param` */
+jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param, float value)
+{
+  switch (param) {
+  case JFLOAT_LAMBDA_LOG_SCALE1:
+    cinfo->master->lambda_log_scale1 = value;
+    break;
+  case JFLOAT_LAMBDA_LOG_SCALE2:
+    cinfo->master->lambda_log_scale2 = value;
+    break;
+  default:  /* unrecognized parameter: raise JERR_BAD_PARAM via ERREXIT */
+    ERREXIT(cinfo, JERR_BAD_PARAM);
+  }
+}
+
+
+GLOBAL(float)  /* Return the cinfo->master field selected by `param` */
+jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param)
+{
+  switch (param) {
+  case JFLOAT_LAMBDA_LOG_SCALE1:
+    return cinfo->master->lambda_log_scale1;
+  case JFLOAT_LAMBDA_LOG_SCALE2:
+    return cinfo->master->lambda_log_scale2;
+  default:  /* unrecognized parameter: raise JERR_BAD_PARAM via ERREXIT */
+    ERREXIT(cinfo, JERR_BAD_PARAM);
+  }
+
+  return -1;  /* reached only if ERREXIT returns; gives the function a defined result */
+}
+
+
+GLOBAL(boolean)  /* TRUE if `param` is one of the integer extension parameters listed below */
+jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param)  /* cinfo is not read */
+{
+  switch (param) {
+  case JINT_TRELLIS_FREQ_SPLIT:
+  case JINT_TRELLIS_NUM_LOOPS:
+    return TRUE;  /* both labels above fall through to here */
+  }
+
+  return FALSE;  /* any other value is not a known integer parameter */
+}
+
+
+GLOBAL(void)  /* Store `value` in the cinfo->master field selected by `param` */
+jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value)
+{
+  switch (param) {
+  case JINT_TRELLIS_FREQ_SPLIT:
+    cinfo->master->trellis_freq_split = value;
+    break;
+  case JINT_TRELLIS_NUM_LOOPS:
+    cinfo->master->trellis_num_loops = value;
+    break;
+  default:  /* unrecognized parameter: raise JERR_BAD_PARAM via ERREXIT */
+    ERREXIT(cinfo, JERR_BAD_PARAM);
+  }
+}
+
+
+GLOBAL(int)  /* Return the cinfo->master field selected by `param` */
+jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
+{
+  switch (param) {
+  case JINT_TRELLIS_FREQ_SPLIT:
+    return cinfo->master->trellis_freq_split;
+  case JINT_TRELLIS_NUM_LOOPS:
+    return cinfo->master->trellis_num_loops;
+  default:  /* unrecognized parameter: raise JERR_BAD_PARAM via ERREXIT */
+    ERREXIT(cinfo, JERR_BAD_PARAM);
+  }
+
+  return -1;  /* reached only if ERREXIT returns; gives the function a defined result */
+}
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 6cab5ab5..938325c6 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -30,6 +30,9 @@
 typedef void (*forward_DCT_method_ptr) (DCTELEM * data);
 typedef void (*float_DCT_method_ptr) (FAST_FLOAT * data);
 
+typedef void (*preprocess_method_ptr)(DCTELEM*, const JQUANT_TBL*);
+typedef void (*float_preprocess_method_ptr)(FAST_FLOAT*, const JQUANT_TBL*);
+
 typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
                                      DCTELEM * workspace);
@@ -51,6 +54,7 @@ typedef struct {
   /* Pointer to the DCT routine actually in use */
   forward_DCT_method_ptr dct;
   convsamp_method_ptr convsamp;
+  preprocess_method_ptr preprocess;
   quantize_method_ptr quantize;
 
   /* The actual post-DCT divisors --- not identical to the quant table
@@ -66,6 +70,7 @@ typedef struct {
   /* Same as above for the floating-point case. */
   float_DCT_method_ptr float_dct;
   float_convsamp_method_ptr float_convsamp;
+  float_preprocess_method_ptr float_preprocess;
   float_quantize_method_ptr float_quantize;
   FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
   FAST_FLOAT * float_workspace;
@@ -349,6 +354,190 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
   }
 }
 
+METHODDEF(float)  /* Cubic interpolation: yields value2 at t=0 and value3 at t=1 */
+catmull_rom(const DCTELEM value1, const DCTELEM value2, const DCTELEM value3, const DCTELEM value4, const float t, int size)
+{
+  const int tan1 = (value3 - value1) * size;  /* tangent at value2, scaled by size (integer arithmetic) */
+  const int tan2 = (value4 - value2) * size;  /* tangent at value3, scaled by size (integer arithmetic) */
+
+  const float t2 = t * t;
+  const float t3 = t2 * t;
+
+  const float f1 = 2.f * t3 - 3.f * t2 + 1.f;  /* weight of value2: 1 at t=0, 0 at t=1 */
+  const float f2 = -2.f * t3 + 3.f * t2;  /* weight of value3: 0 at t=0, 1 at t=1 */
+  const float f3 = t3 - 2.f * t2 + t;  /* weight of tan1: 0 at both t=0 and t=1 */
+  const float f4 = t3 - t2;  /* weight of tan2: 0 at both t=0 and t=1 */
+
+  return value2 * f1 + tan1 * f3 +
+         value3 * f2 + tan2 * f4;
+}
+
+/** Prevents visible ringing artifacts near hard edges on white backgrounds.
+
+  1. JPEG can encode samples with higher values than it's possible to display (higher than 255 in RGB),
+     and the decoder will always clamp values to 0-255. To encode 255 you can use any value >= 255,
+     and distortions of the out-of-range values won't be visible as long as they decode to anything >= 255.
+
+  2. From the DCT's perspective, pixels in a block are a waveform. Hard edges form square waves (bad).
+     Edges with white are similar to waveform clipping, and anti-clipping algorithms can turn square waves
+     into softer ones that compress better.
+
+  The block in `data` is modified in place; quantization_table->quantval[0] limits the overshoot.
+ */
+METHODDEF(void)
+preprocess_deringing(DCTELEM *data, const JQUANT_TBL *quantization_table)
+{
+  const DCTELEM maxsample = 255 - CENTERJSAMPLE;
+  const int size = DCTSIZE * DCTSIZE;
+
+  /* Decoders don't handle overflow of DC very well, so calculate
+     maximum overflow that is safe to do without increasing DC out of range */
+  int sum = 0;
+  int maxsample_count = 0;
+  int i;
+  DCTELEM maxovershoot;
+  int n;
+  /* Total the block and count the samples at or above maxsample */
+  for(i=0; i < size; i++) {
+    sum += data[i];
+    if (data[i] >= maxsample) {
+      maxsample_count++;
+    }
+  }
+
+  /* If nothing reaches max value there's nothing to overshoot
+     and if the block is completely flat, it's already the best case. */
+  if (!maxsample_count || maxsample_count == size) {
+    return;
+  }
+
+  /* Too much overshoot is not good: increased amplitude will cost bits, and the cost is proportional to quantization (here using DC quant as a rough guide). */
+  maxovershoot = maxsample + MIN(MIN(31, 2*quantization_table->quantval[0]), (maxsample * size - sum) / maxsample_count);
+
+  n = 0;
+  do {
+    int start, end, length;
+    DCTELEM f1, f2, l1, l2, fslope, lslope;
+    float step, position;
+    /* Samples below maxsample are left untouched; just step past them */
+    /* Pixels are traversed in zig-zag order to process them as a line */
+    if (data[jpeg_natural_order[n]] < maxsample) {
+      n++;
+      continue;
+    }
+
+    /* Find a run of maxsample pixels. Start is the first pixel inside the range, end the first pixel outside. */
+    start = n;
+    while(++n < size && data[jpeg_natural_order[n]] >= maxsample) {}
+    end = n;
+
+    /* the run will be replaced with a catmull-rom interpolation of values from the edges */
+
+    /* Find suitable upward slope from pixels around edges of the run.
+       Just feeding nearby pixels as catmull rom points isn't good enough,
+       as slope with one sample before the edge may have been flattened by clipping,
+       and slope of two samples before the edge could be downward. */
+    f1 = data[jpeg_natural_order[start >= 1 ? start-1 : 0]];  /* index clamped to 0 at the block start */
+    f2 = data[jpeg_natural_order[start >= 2 ? start-2 : 0]];  /* index clamped to 0 at the block start */
+
+    l1 = data[jpeg_natural_order[end < size-1 ? end : size-1]];  /* index clamped to size-1 at the block end */
+    l2 = data[jpeg_natural_order[end < size-2 ? end+1 : size-1]];  /* index clamped to size-1 at the block end */
+
+    fslope = MAX(f1-f2, maxsample-f1);
+    lslope = MAX(l1-l2, maxsample-l1);
+
+    /* if slope at the start/end is unknown, just make the curve symmetric */
+    if (start == 0) {
+      fslope = lslope;
+    }
+    if (end == size) {
+      lslope = fslope;
+    }
+
+    /* The curve fits better if first and last pixel is omitted */
+    length = end - start;
+    step = 1.f/(float)(length + 1);
+    position = step;
+
+    for(i = start; i < end; i++, position += step) {
+      DCTELEM tmp = ceilf(catmull_rom(maxsample - fslope, maxsample, maxsample, maxsample - lslope, position, length));  /* rounded up to a whole sample value */
+      data[jpeg_natural_order[i]] = MIN(tmp, maxovershoot);  /* never exceed the overshoot limit computed above */
+    }
+    n++;  /* the sample at `end` is already known to be below maxsample (or is past the block) */
+  }
+  while(n < size);
+}
+
+/*
+  Float version of preprocess_deringing(): the same steps on FAST_FLOAT samples, without ceilf() rounding.
+ */
+METHODDEF(void)
+float_preprocess_deringing(FAST_FLOAT *data, const JQUANT_TBL *quantization_table)
+{
+  const FAST_FLOAT maxsample = 255 - CENTERJSAMPLE;
+  const int size = DCTSIZE * DCTSIZE;
+
+  FAST_FLOAT sum = 0;
+  int maxsample_count = 0;
+  int i;
+  int n;
+  FAST_FLOAT maxovershoot;
+  /* Total the block and count the samples at or above maxsample */
+  for(i=0; i < size; i++) {
+    sum += data[i];
+    if (data[i] >= maxsample) {
+      maxsample_count++;
+    }
+  }
+  /* Nothing to do when no sample, or every sample, is at or above maxsample */
+  if (!maxsample_count || maxsample_count == size) {
+    return;
+  }
+  /* Overshoot cap: at most 31, twice quantval[0], or the per-sample share of (maxsample * size - sum) */
+  maxovershoot = maxsample + MIN(MIN(31, 2*quantization_table->quantval[0]), (maxsample * size - sum) / maxsample_count);
+
+  n = 0;
+  do {
+    int start, end, length;
+    FAST_FLOAT f1, f2, l1, l2, fslope, lslope;
+    float step, position;
+    /* Samples are visited through jpeg_natural_order[]; those below maxsample are skipped */
+    if (data[jpeg_natural_order[n]] < maxsample) {
+      n++;
+      continue;
+    }
+    /* [start, end) becomes a run of samples at or above maxsample */
+    start = n;
+    while(++n < size && data[jpeg_natural_order[n]] >= maxsample) {}
+    end = n;
+    /* Neighbours just before the run (indices clamped to 0) ... */
+    f1 = data[jpeg_natural_order[start >= 1 ? start-1 : 0]];
+    f2 = data[jpeg_natural_order[start >= 2 ? start-2 : 0]];
+    /* ... and just after it (indices clamped to size-1) */
+    l1 = data[jpeg_natural_order[end < size-1 ? end : size-1]];
+    l2 = data[jpeg_natural_order[end < size-2 ? end+1 : size-1]];
+
+    fslope = MAX(f1-f2, maxsample-f1);
+    lslope = MAX(l1-l2, maxsample-l1);
+    /* A run touching either end of the block borrows the slope from its other side */
+    if (start == 0) {
+      fslope = lslope;
+    }
+    if (end == size) {
+      lslope = fslope;
+    }
+    /* Interpolation positions are spaced 1/(length+1) apart, starting one step in */
+    length = end - start;
+    step = 1.f/(float)(length + 1);
+    position = step;
+
+    for(i = start; i < end; i++, position += step) {
+      FAST_FLOAT tmp = catmull_rom(maxsample - fslope, maxsample, maxsample, maxsample - lslope, position, length);  /* NOTE(review): catmull_rom() takes DCTELEM arguments, so these FAST_FLOAT values are converted on the call */
+      data[jpeg_natural_order[i]] = MIN(tmp, maxovershoot);  /* never exceed the overshoot limit computed above */
+    }
+    n++;  /* the sample at `end` is already known to be below maxsample (or is past the block) */
+  }
+  while(n < size);
+}
 
 /*
  * Load data into workspace, applying unsigned->signed conversion.
@@ -449,7 +638,7 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
       temp = -temp;
       temp += qval>>1;  /* for rounding */
       DIVIDE_BY(temp, qval);
-      temp = -temp;    
+      temp = -temp;
     } else {
       temp += qval>>1;  /* for rounding */
       DIVIDE_BY(temp, qval);
@@ -480,12 +669,14 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
   DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
+  JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[compptr->quant_tbl_no];
   DCTELEM * workspace;
   JDIMENSION bi;
 
   /* Make sure the compiler doesn't look up these every pass */
   forward_DCT_method_ptr do_dct = fdct->dct;
   convsamp_method_ptr do_convsamp = fdct->convsamp;
+  preprocess_method_ptr do_preprocess = fdct->preprocess;
   quantize_method_ptr do_quantize = fdct->quantize;
   workspace = fdct->workspace;
 
@@ -495,6 +686,10 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
     /* Load data into workspace, applying unsigned->signed conversion */
     (*do_convsamp) (sample_data, start_col, workspace);
 
+    if (do_preprocess) {
+      (*do_preprocess) (workspace, qtbl);
+    }
+
     /* Perform the DCT */
     (*do_dct) (workspace);
 
@@ -600,6 +795,7 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
   FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
+  JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[compptr->quant_tbl_no];
   FAST_FLOAT * workspace;
   JDIMENSION bi;
   float v;
@@ -609,6 +805,7 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
   /* Make sure the compiler doesn't look up these every pass */
   float_DCT_method_ptr do_dct = fdct->float_dct;
   float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
+  float_preprocess_method_ptr do_preprocess = fdct->float_preprocess;
   float_quantize_method_ptr do_quantize = fdct->float_quantize;
   workspace = fdct->float_workspace;
 
@@ -618,13 +815,17 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
     /* Load data into workspace, applying unsigned->signed conversion */
     (*do_convsamp) (sample_data, start_col, workspace);
 
+    if (do_preprocess) {
+      (*do_preprocess) (workspace, qtbl);
+    }
+
     /* Perform the DCT */
     (*do_dct) (workspace);
 
     /* Save unquantized transform coefficients for later trellis quantization */
     /* Currently save as integer values. Could save float values but would require */
     /* modifications to memory allocation and trellis quantization */
-    
+
     if (dst) {
       int i;
       static const double aanscalefactor[DCTSIZE] = {
@@ -673,6 +874,8 @@ static const float jpeg_lambda_weights_csf_luma[64] = {
   0.43454f, 0.42146f, 0.34609f, 0.24072f, 0.15975f, 0.10701f, 0.07558f, 0.05875f,
 };
 
+#define DC_TRELLIS_CANDIDATES 3  /* per-block DC candidates: qval-1 .. qval+1 */
+
 GLOBAL(void)
 quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
                  JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val)
@@ -688,7 +891,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
   float lambda_base;
   float lambda;
   float lambda_dc;
-  const float *lambda_tbl = (cinfo->use_lambda_weight_tbl) ? jpeg_lambda_weights_csf_luma : jpeg_lambda_weights_flat;
+  const float *lambda_tbl = (cinfo->master->use_lambda_weight_tbl) ?
+                            jpeg_lambda_weights_csf_luma :
+                            jpeg_lambda_weights_flat;
   int Ss, Se;
   float *accumulated_zero_block_cost = NULL;
   float *accumulated_block_cost = NULL;
@@ -701,9 +906,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
   int zero_run;
   int run_bits;
   int rate;
-  float *accumulated_dc_cost[3];
-  int *dc_cost_backtrack[3];
-  JCOEF *dc_candidate[3];
+  float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
+  int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
+  JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
 
   Ss = cinfo->Ss;
   Se = cinfo->Se;
@@ -711,7 +916,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     Ss = 1;
   if (Se < Ss)
     return;
-  if (cinfo->trellis_eob_opt) {
+  if (cinfo->master->trellis_eob_opt) {
     accumulated_zero_block_cost = (float *)malloc((num_blocks + 1) * sizeof(float));
     accumulated_block_cost = (float *)malloc((num_blocks + 1) * sizeof(float));
     block_run_start = (int *)malloc(num_blocks * sizeof(int));
@@ -727,8 +932,8 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     accumulated_block_cost[0] = 0;
     requires_eob[0] = 0;
   }
-  if (cinfo->trellis_quant_dc) {
-    for (i = 0; i < 3; i++) {
+  if (cinfo->master->trellis_quant_dc) {
+    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
       accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
       dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
       dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
@@ -755,18 +960,19 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     }
     norm /= 63.0;
     
-    if (cinfo->lambda_log_scale2 > 0.0)
-      lambda = pow(2.0, cinfo->lambda_log_scale1) * lambda_base / (pow(2.0, cinfo->lambda_log_scale2) + norm);
+    if (cinfo->master->lambda_log_scale2 > 0.0)
+      lambda = pow(2.0, cinfo->master->lambda_log_scale1) * lambda_base /
+                   (pow(2.0, cinfo->master->lambda_log_scale2) + norm);
     else
-      lambda = pow(2.0, cinfo->lambda_log_scale1-12.0) * lambda_base;
+      lambda = pow(2.0, cinfo->master->lambda_log_scale1 - 12.0) * lambda_base;
     
     lambda_dc = lambda * lambda_tbl[0];
     
     accumulated_zero_dist[Ss-1] = 0.0;
     accumulated_cost[Ss-1] = 0.0;
-    
-    // Do DC coefficient
-    if (cinfo->trellis_quant_dc) {
+
+    /* Do DC coefficient */
+    if (cinfo->master->trellis_quant_dc) {
       int sign = src[bi][0] >> 31;
       int x = abs(src[bi][0]);
       int q = 8 * qtbl->quantval[0];
@@ -774,20 +980,20 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
       float dc_candidate_dist;
 
       qval = (x + q/2) / q; /* quantized value (round nearest) */
-      for (k = 0; k < 3; k++) {
+      for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
         int delta;
         int dc_delta;
         int bits;
-        
-        dc_candidate[k][bi] = qval - 1 + k;
+
+        dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
         delta = dc_candidate[k][bi] * q - x;
         dc_candidate_dist = delta * delta * lambda_dc;
         dc_candidate[k][bi] *= 1 + 2*sign;
         
         if (bi == 0) {
           dc_delta = dc_candidate[k][bi] - *last_dc_val;
-          
-          // Derive number of suffix bits
+
+          /* Derive number of suffix bits */
           bits = 0;
           dc_delta = abs(dc_delta);
           while (dc_delta) {
@@ -798,10 +1004,10 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
           accumulated_dc_cost[k][0] = cost;
           dc_cost_backtrack[k][0] = -1;
         } else {
-          for (l = 0; l < 3; l++) {
+          for (l = 0; l < DC_TRELLIS_CANDIDATES; l++) {
             dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
-            
-            // Derive number of suffix bits
+
+            /* Derive number of suffix bits */
             bits = 0;
             dc_delta = abs(dc_delta);
             while (dc_delta) {
@@ -814,14 +1020,14 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
               dc_cost_backtrack[k][bi] = l;
             }
           }
-        }        
+        }
       }
     }
-    
-    // Do AC coefficients
+
+    /* Do AC coefficients */
     for (i = Ss; i <= Se; i++) {
       int z = jpeg_natural_order[i];
-      
+
       int sign = src[bi][z] >> 31;
       int x = abs(src[bi][z]);
       int q = 8 * qtbl->quantval[z];
@@ -919,7 +1125,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
       i--;
     }
     
-    if (cinfo->trellis_eob_opt) {
+    if (cinfo->master->trellis_eob_opt) {
       accumulated_zero_block_cost[bi+1] = accumulated_zero_block_cost[bi];
       accumulated_zero_block_cost[bi+1] += cost_all_zeros;
       requires_eob[bi+1] = has_eob;
@@ -953,7 +1159,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     }
   }
   
-  if (cinfo->trellis_eob_opt) {
+  if (cinfo->master->trellis_eob_opt) {
     int last_block = num_blocks;
     best_cost = 1e38;
     
@@ -994,7 +1200,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     free(requires_eob);
   }
   
-  if (cinfo->trellis_q_opt) {
+  if (cinfo->master->trellis_q_opt) {
     for (bi = 0; bi < num_blocks; bi++) {
       for (i = 1; i < DCTSIZE2; i++) {
         norm_src[i] += src[bi][i] * coef_blocks[bi][i];
@@ -1003,9 +1209,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     }
   }
   
-  if (cinfo->trellis_quant_dc) {
+  if (cinfo->master->trellis_quant_dc) {
     j = 0;
-    for (i = 1; i < 3; i++) {
+    for (i = 1; i < DC_TRELLIS_CANDIDATES; i++) {
       if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
         j = i;
     }
@@ -1013,11 +1219,11 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
       coef_blocks[bi][0] = dc_candidate[j][bi];
       j = dc_cost_backtrack[j][bi];
     }
-    
-    // Save DC predictor
+
+    /* Save DC predictor */
     *last_dc_val = coef_blocks[num_blocks-1][0];
-    
-    for (i = 0; i < 3; i++) {
+
+    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
       free(accumulated_dc_cost[i]);
       free(dc_cost_backtrack[i]);
       free(dc_candidate[i]);
@@ -1089,6 +1295,13 @@ jinit_forward_dct (j_compress_ptr cinfo)
       fdct->convsamp = jsimd_convsamp;
     else
       fdct->convsamp = convsamp;
+
+    if (cinfo->master->overshoot_deringing) {
+      fdct->preprocess = preprocess_deringing;
+    } else {
+      fdct->preprocess = NULL;
+    }
+
     if (jsimd_can_quantize())
       fdct->quantize = jsimd_quantize;
     else
@@ -1101,6 +1314,13 @@ jinit_forward_dct (j_compress_ptr cinfo)
       fdct->float_convsamp = jsimd_convsamp_float;
     else
       fdct->float_convsamp = convsamp_float;
+
+    if (cinfo->master->overshoot_deringing) {
+      fdct->float_preprocess = float_preprocess_deringing;
+    } else {
+      fdct->float_preprocess = NULL;
+    }
+
     if (jsimd_can_quantize_float())
       fdct->float_quantize = jsimd_quantize_float;
     else
diff --git a/jcinit.c b/jcinit.c
index 6dfb4f70..f5ecf5f5 100644
--- a/jcinit.c
+++ b/jcinit.c
@@ -62,7 +62,8 @@ jinit_compress_master (j_compress_ptr cinfo)
 
   /* Need a full-image coefficient buffer in any multi-pass mode. */
   jinit_c_coef_controller(cinfo,
-		(boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding || cinfo->optimize_scans));
+    (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding ||
+               cinfo->master->optimize_scans));
   jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   jinit_marker_writer(cinfo);
diff --git a/jcmarker.c b/jcmarker.c
index 3e78f299..a62da057 100644
--- a/jcmarker.c
+++ b/jcmarker.c
@@ -192,6 +192,7 @@ emit_multi_dqt (j_compress_ptr cinfo)
   int seen[MAX_COMPONENTS] = { 0 };
   int fin_prec = 0;
   int ci;
+  int size = 0;
 
   for (ci = 0; ci < cinfo->num_components; ci++) {
     int tbl_num = cinfo->comp_info[ci].quant_tbl_no;
@@ -210,7 +211,6 @@ emit_multi_dqt (j_compress_ptr cinfo)
 
   emit_marker(cinfo, M_DQT);
 
-  int size = 0;
   for (ci = 0; ci < cinfo->num_components; ci++) {
     int tbl_num = cinfo->comp_info[ci].quant_tbl_no;
 
diff --git a/jcmaster.c b/jcmaster.c
index 17058678..37e577cd 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2014, D. R. Commander.
  * mozjpeg Modifications:
  * Copyright (C) 2014, Mozilla Corporation.
  * For conditions of distribution and use, see the accompanying README file.
@@ -20,6 +20,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jpegcomp.h"
+#include "jmemsys.h"
 
 
 /* Private state */
@@ -193,7 +194,7 @@ validate_script (j_compress_ptr cinfo)
   /* -1 until that coefficient has been seen; then last Al for it */
 #endif
 
-  if (cinfo->optimize_scans) {
+  if (cinfo->master->optimize_scans) {
     cinfo->progressive_mode = TRUE;
     /* When we optimize scans, there is redundancy in the scan list
      * and this function will fail. Therefore skip all this checking
@@ -332,12 +333,18 @@ select_scan_parameters (j_compress_ptr cinfo)
   my_master_ptr master = (my_master_ptr) cinfo->master;
   if (master->pass_number < master->pass_number_scan_opt_base) {
     cinfo->comps_in_scan = 1;
-    if (cinfo->use_scans_in_trellis) {
-      cinfo->cur_comp_info[0] = &cinfo->comp_info[master->pass_number/(4*cinfo->trellis_num_loops)];
-      cinfo->Ss = (master->pass_number%4 < 2) ? 1 : cinfo->trellis_freq_split+1;
-      cinfo->Se = (master->pass_number%4 < 2) ? cinfo->trellis_freq_split : DCTSIZE2-1;
+    if (cinfo->master->use_scans_in_trellis) {
+      cinfo->cur_comp_info[0] =
+        &cinfo->comp_info[master->pass_number /
+                          (4 * cinfo->master->trellis_num_loops)];
+      cinfo->Ss = (master->pass_number % 4 < 2) ?
+                  1 : cinfo->master->trellis_freq_split + 1;
+      cinfo->Se = (master->pass_number % 4 < 2) ?
+                  cinfo->master->trellis_freq_split : DCTSIZE2 - 1;
     } else {
-      cinfo->cur_comp_info[0] = &cinfo->comp_info[master->pass_number/(2*cinfo->trellis_num_loops)];
+      cinfo->cur_comp_info[0] =
+        &cinfo->comp_info[master->pass_number /
+                          (2 * cinfo->master->trellis_num_loops)];
       cinfo->Ss = 1;
       cinfo->Se = DCTSIZE2-1;
     }
@@ -355,13 +362,16 @@ select_scan_parameters (j_compress_ptr cinfo)
     cinfo->Se = scanptr->Se;
     cinfo->Ah = scanptr->Ah;
     cinfo->Al = scanptr->Al;
-    if (cinfo->optimize_scans) {
+    if (cinfo->master->optimize_scans) {
       /* luma frequency split passes */
-      if (master->scan_number >= cinfo->num_scans_luma_dc+3*cinfo->Al_max_luma+2 &&
-          master->scan_number < cinfo->num_scans_luma)
+      if (master->scan_number >= cinfo->master->num_scans_luma_dc +
+                                 3 * cinfo->master->Al_max_luma + 2 &&
+          master->scan_number < cinfo->master->num_scans_luma)
         cinfo->Al = master->best_Al_luma;
       /* chroma frequency split passes */
-      if (master->scan_number >= cinfo->num_scans_luma+cinfo->num_scans_chroma_dc+(6*cinfo->Al_max_chroma+4) &&
+      if (master->scan_number >= cinfo->master->num_scans_luma +
+                                 cinfo->master->num_scans_chroma_dc +
+                                 (6 * cinfo->master->Al_max_chroma + 4) &&
           master->scan_number < cinfo->num_scans)
         cinfo->Al = master->best_Al_chroma;
     }
@@ -484,7 +494,8 @@ METHODDEF(void)
 prepare_for_pass (j_compress_ptr cinfo)
 {
   my_master_ptr master = (my_master_ptr) cinfo->master;
-  cinfo->trellis_passes = master->pass_number < master->pass_number_scan_opt_base;
+  cinfo->master->trellis_passes =
+    master->pass_number < master->pass_number_scan_opt_base;
 
   switch (master->pass_type) {
   case main_pass:
@@ -537,7 +548,7 @@ prepare_for_pass (j_compress_ptr cinfo)
       select_scan_parameters(cinfo);
       per_scan_setup(cinfo);
     }
-    if (cinfo->optimize_scans) {
+    if (cinfo->master->optimize_scans) {
       master->saved_dest = cinfo->dest;
       cinfo->dest = NULL;
       master->scan_size[master->scan_number] = 0;
@@ -553,13 +564,15 @@ prepare_for_pass (j_compress_ptr cinfo)
     master->pub.call_pass_startup = FALSE;
     break;
   case trellis_pass:
-    if (master->pass_number%(cinfo->num_components*(cinfo->use_scans_in_trellis?4:2)) == 1 && cinfo->trellis_q_opt) {
+    if (master->pass_number %
+        (cinfo->num_components * (cinfo->master->use_scans_in_trellis ? 4 : 2)) == 1 &&
+        cinfo->master->trellis_q_opt) {
       int i, j;
 
       for (i = 0; i < NUM_QUANT_TBLS; i++) {
         for (j = 1; j < DCTSIZE2; j++) {
-          cinfo->norm_src[i][j] = 0.0;
-          cinfo->norm_coef[i][j] = 0.0;
+          cinfo->master->norm_src[i][j] = 0.0;
+          cinfo->master->norm_coef[i][j] = 0.0;
         }
       }
     }
@@ -641,8 +654,11 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
   my_master_ptr master = (my_master_ptr) cinfo->master;
   
   int base_scan_idx = 0;
-  int luma_freq_split_scan_start = cinfo->num_scans_luma_dc + 3 * cinfo->Al_max_luma + 2;
-  int chroma_freq_split_scan_start = cinfo->num_scans_luma+cinfo->num_scans_chroma_dc+(6*cinfo->Al_max_chroma+4);
+  int luma_freq_split_scan_start = cinfo->master->num_scans_luma_dc +
+                                   3 * cinfo->master->Al_max_luma + 2;
+  int chroma_freq_split_scan_start = cinfo->master->num_scans_luma +
+                                     cinfo->master->num_scans_chroma_dc +
+                                     (6 * cinfo->master->Al_max_chroma + 4);
 
   if (next_scan_number > 1 && next_scan_number <= luma_freq_split_scan_start) {
     if ((next_scan_number - 1) % 3 == 2) {
@@ -663,7 +679,8 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
       }
     }
   
-  } else if (next_scan_number > luma_freq_split_scan_start && next_scan_number <= cinfo->num_scans_luma) {
+  } else if (next_scan_number > luma_freq_split_scan_start &&
+             next_scan_number <= cinfo->master->num_scans_luma) {
     if (next_scan_number == luma_freq_split_scan_start + 1) {
       master->best_freq_split_idx_luma = 0;
       master->best_cost = master->scan_size[next_scan_number-1];
@@ -683,21 +700,25 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
       if ((idx == 2 && master->best_freq_split_idx_luma == 0) ||
           (idx == 3 && master->best_freq_split_idx_luma != 2) ||
           (idx == 4 && master->best_freq_split_idx_luma != 4)) {
-        master->scan_number = cinfo->num_scans_luma - 1;
+        master->scan_number = cinfo->master->num_scans_luma - 1;
         master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
         master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
       }
     }
     
-  } else if (cinfo->num_scans > cinfo->num_scans_luma) {
+  } else if (cinfo->num_scans > cinfo->master->num_scans_luma) {
+
+    if (next_scan_number == cinfo->master->num_scans_luma +
+                            cinfo->master->num_scans_chroma_dc) {
+      base_scan_idx = cinfo->master->num_scans_luma;
 
-    if (next_scan_number == cinfo->num_scans_luma+cinfo->num_scans_chroma_dc) {
-      base_scan_idx = cinfo->num_scans_luma;
-      
       master->interleave_chroma_dc = master->scan_size[base_scan_idx] <= master->scan_size[base_scan_idx+1] + master->scan_size[base_scan_idx+2];
       
-    } else if (next_scan_number > cinfo->num_scans_luma+cinfo->num_scans_chroma_dc && next_scan_number <= chroma_freq_split_scan_start) {
-      base_scan_idx = cinfo->num_scans_luma + cinfo->num_scans_chroma_dc;
+    } else if (next_scan_number > cinfo->master->num_scans_luma +
+                                  cinfo->master->num_scans_chroma_dc &&
+               next_scan_number <= chroma_freq_split_scan_start) {
+      base_scan_idx = cinfo->master->num_scans_luma +
+                      cinfo->master->num_scans_chroma_dc;
       if ((next_scan_number - base_scan_idx) % 6 == 4) {
         int Al = (next_scan_number - base_scan_idx) / 6;
         int i;
@@ -757,10 +778,11 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
     
     copy_buffer(cinfo, 0);
 
-    if (cinfo->num_scans > cinfo->num_scans_luma && !cinfo->one_dc_scan) {
-      base_scan_idx = cinfo->num_scans_luma;
+    if (cinfo->num_scans > cinfo->master->num_scans_luma &&
+        !cinfo->master->one_dc_scan) {
+      base_scan_idx = cinfo->master->num_scans_luma;
       
-      if (master->interleave_chroma_dc && !cinfo->sep_dc_scan)
+      if (master->interleave_chroma_dc && !cinfo->master->sep_dc_scan)
         copy_buffer(cinfo, base_scan_idx);
       else {
         copy_buffer(cinfo, base_scan_idx+1);
@@ -779,7 +801,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
     for (Al = master->best_Al_luma-1; Al >= min_Al; Al--)
       copy_buffer(cinfo, 3 + 3*Al);
 
-    if (cinfo->num_scans > cinfo->num_scans_luma) {
+    if (cinfo->num_scans > cinfo->master->num_scans_luma) {
       if (master->best_freq_split_idx_chroma == 0) {
         copy_buffer(cinfo, chroma_freq_split_scan_start);
         copy_buffer(cinfo, chroma_freq_split_scan_start+1);
@@ -791,7 +813,8 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
         copy_buffer(cinfo, chroma_freq_split_scan_start+4*(master->best_freq_split_idx_chroma-1)+5);
       }
       
-      base_scan_idx = cinfo->num_scans_luma + cinfo->num_scans_chroma_dc;
+      base_scan_idx = cinfo->master->num_scans_luma +
+                      cinfo->master->num_scans_chroma_dc;
       
       for (Al = master->best_Al_chroma-1; Al >= min_Al; Al--) {
         copy_buffer(cinfo, base_scan_idx + 6*Al + 4);
@@ -802,7 +825,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
     for (Al = min_Al-1; Al >= 0; Al--) {
       copy_buffer(cinfo, 3 + 3*Al);
       
-      if (cinfo->num_scans > cinfo->num_scans_luma) {
+      if (cinfo->num_scans > cinfo->master->num_scans_luma) {
         copy_buffer(cinfo, base_scan_idx + 6*Al + 4);
         copy_buffer(cinfo, base_scan_idx + 6*Al + 5);
       }
@@ -835,7 +858,7 @@ finish_pass_master (j_compress_ptr cinfo)
     /* next pass is either output of scan 0 (after optimization)
      * or output of scan 1 (if no optimization).
      */
-    if (cinfo->trellis_quant)
+    if (cinfo->master->trellis_quant)
       master->pass_type = trellis_pass;
     else {
       master->pass_type = output_pass;
@@ -851,7 +874,7 @@ finish_pass_master (j_compress_ptr cinfo)
     /* next pass is either optimization or output of next scan */
     if (cinfo->optimize_coding)
       master->pass_type = huff_opt_pass;
-    if (cinfo->optimize_scans) {
+    if (cinfo->master->optimize_scans) {
       (*cinfo->dest->term_destination)(cinfo);
       cinfo->dest = master->saved_dest;
       select_scans(cinfo, master->scan_number + 1);
@@ -862,13 +885,16 @@ finish_pass_master (j_compress_ptr cinfo)
   case trellis_pass:
     master->pass_type = (cinfo->optimize_coding || master->pass_number < master->pass_number_scan_opt_base-1) ? huff_opt_pass : output_pass;
       
-    if ((master->pass_number+1)%(cinfo->num_components*(cinfo->use_scans_in_trellis?4:2)) == 0 && cinfo->trellis_q_opt) {
+    if ((master->pass_number + 1) %
+        (cinfo->num_components * (cinfo->master->use_scans_in_trellis ? 4 : 2)) == 0 &&
+        cinfo->master->trellis_q_opt) {
       int i, j;
 
       for (i = 0; i < NUM_QUANT_TBLS; i++) {
         for (j = 1; j < DCTSIZE2; j++) {
-          if (cinfo->norm_coef[i][j] != 0.0) {
-            int q = (int)(cinfo->norm_src[i][j] / cinfo->norm_coef[i][j] + 0.5);
+          if (cinfo->master->norm_coef[i][j] != 0.0) {
+            int q = (int)(cinfo->master->norm_src[i][j] /
+                          cinfo->master->norm_coef[i][j] + 0.5);
             if (q > 254) q = 254;
             if (q < 1) q = 1;
             cinfo->quant_tbl_ptrs[i]->quantval[j] = q;
@@ -895,6 +921,11 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
   master = (my_master_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                   sizeof(my_comp_master));
+  if (cinfo->master) {
+    MEMCOPY(&master->pub, cinfo->master, sizeof(struct jpeg_comp_master));
+    jpeg_free_small((j_common_ptr) cinfo, cinfo->master,
+                    sizeof(struct jpeg_comp_master));
+  }
   cinfo->master = (struct jpeg_comp_master *) master;
   master->pub.prepare_for_pass = prepare_for_pass;
   master->pub.pass_startup = pass_startup;
@@ -937,12 +968,14 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
     master->total_passes = cinfo->num_scans;
   
   master->pass_number_scan_opt_base = 0;
-  if (cinfo->trellis_quant) {
-    master->pass_number_scan_opt_base = ((cinfo->use_scans_in_trellis) ? 4 : 2) * cinfo->num_components * cinfo->trellis_num_loops;
+  if (cinfo->master->trellis_quant) {
+    master->pass_number_scan_opt_base =
+      ((cinfo->master->use_scans_in_trellis) ? 4 : 2) * cinfo->num_components *
+      cinfo->master->trellis_num_loops;
     master->total_passes += master->pass_number_scan_opt_base;
   }
   
-  if (cinfo->optimize_scans) {
+  if (cinfo->master->optimize_scans) {
     int i;
     master->best_Al_chroma = 0;
     
diff --git a/jcparam.c b/jcparam.c
index d3b8ea15..8af8ae64 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -136,7 +136,7 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
  */
 {
   /* Set up two quantization tables using the specified scaling */
-  if (cinfo->use_flat_quant_tbl) {
+  if (cinfo->master->use_flat_quant_tbl) {
     jpeg_add_quant_table(cinfo, 0, flat_quant_tbl,
                          scale_factor, force_baseline);
     jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
@@ -248,7 +248,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 #ifdef C_PROGRESSIVE_SUPPORTED
   cinfo->scan_info = NULL;
   cinfo->num_scans = 0;
-  if (!cinfo->use_moz_defaults) {
+  if (!cinfo->master->use_moz_defaults) {
   /* Default is no multiple-scan output */
   cinfo->scan_info = NULL;
   cinfo->num_scans = 0;
@@ -266,7 +266,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->arith_code = FALSE;
 
 #ifdef ENTROPY_OPT_SUPPORTED
-  if (cinfo->use_moz_defaults)
+  if (cinfo->master->use_moz_defaults)
     /* By default, do extra passes to optimize entropy coding */
     cinfo->optimize_coding = TRUE;
   else
@@ -293,6 +293,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->do_fancy_downsampling = TRUE;
 #endif
 
+  cinfo->master->overshoot_deringing = cinfo->master->use_moz_defaults;
+
   /* No input smoothing */
   cinfo->smoothing_factor = 0;
 
@@ -322,26 +324,26 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 
   jpeg_default_colorspace(cinfo);
   
-  cinfo->one_dc_scan = TRUE;
+  cinfo->master->one_dc_scan = TRUE;
   
 #ifdef C_PROGRESSIVE_SUPPORTED
-  if (cinfo->use_moz_defaults) {
-    cinfo->optimize_scans = TRUE;
+  if (cinfo->master->use_moz_defaults) {
+    cinfo->master->optimize_scans = TRUE;
     jpeg_simple_progression(cinfo);
   } else
-    cinfo->optimize_scans = FALSE;
+    cinfo->master->optimize_scans = FALSE;
 #endif
   
-  cinfo->trellis_quant = cinfo->use_moz_defaults;
-  cinfo->lambda_log_scale1 = 16.0;
-  cinfo->lambda_log_scale2 = 15.5;
+  cinfo->master->trellis_quant = cinfo->master->use_moz_defaults;
+  cinfo->master->lambda_log_scale1 = 16.0;
+  cinfo->master->lambda_log_scale2 = 15.5;
   
-  cinfo->use_lambda_weight_tbl = TRUE;
-  cinfo->use_scans_in_trellis = FALSE;
-  cinfo->trellis_freq_split = 8;
-  cinfo->trellis_num_loops = 1;
-  cinfo->trellis_q_opt = FALSE;
-  cinfo->trellis_quant_dc = TRUE;
+  cinfo->master->use_lambda_weight_tbl = TRUE;
+  cinfo->master->use_scans_in_trellis = FALSE;
+  cinfo->master->trellis_freq_split = 8;
+  cinfo->master->trellis_num_loops = 1;
+  cinfo->master->trellis_q_opt = FALSE;
+  cinfo->master->trellis_quant_dc = TRUE;
 }
 
 
@@ -574,7 +576,7 @@ jpeg_search_progression (j_compress_ptr cinfo)
   } else if (ncomps == 1) {
     nscans = 23;
   } else {
-    cinfo->num_scans_luma = 0;
+    cinfo->master->num_scans_luma = 0;
     return FALSE;
   }
   
@@ -595,10 +597,12 @@ jpeg_search_progression (j_compress_ptr cinfo)
   cinfo->scan_info = scanptr;
   cinfo->num_scans = nscans;
   
-  cinfo->Al_max_luma = 3;
-  cinfo->num_scans_luma_dc = 1;
-  cinfo->num_frequency_splits = 5;
-  cinfo->num_scans_luma = cinfo->num_scans_luma_dc + (3 * cinfo->Al_max_luma + 2) + (2 * cinfo->num_frequency_splits + 1);
+  cinfo->master->Al_max_luma = 3;
+  cinfo->master->num_scans_luma_dc = 1;
+  cinfo->master->num_frequency_splits = 5;
+  cinfo->master->num_scans_luma =
+    cinfo->master->num_scans_luma_dc + (3 * cinfo->master->Al_max_luma + 2) +
+    (2 * cinfo->master->num_frequency_splits + 1);
   
   /* 23 scans for luma */
   /* 1 scan for DC */
@@ -609,7 +613,7 @@ jpeg_search_progression (j_compress_ptr cinfo)
   /* last 4 done conditionally */
   
   /* luma DC by itself */
-  if (cinfo->one_dc_scan)
+  if (cinfo->master->one_dc_scan)
     scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
   else
     scanptr = fill_dc_scans(scanptr, 1, 0, 0);
@@ -617,7 +621,7 @@ jpeg_search_progression (j_compress_ptr cinfo)
   scanptr = fill_a_scan(scanptr, 0, 1, 8, 0, 0);
   scanptr = fill_a_scan(scanptr, 0, 9, 63, 0, 0);
   
-  for (Al = 0; Al < cinfo->Al_max_luma; Al++) {
+  for (Al = 0; Al < cinfo->master->Al_max_luma; Al++) {
     scanptr = fill_a_scan(scanptr, 0, 1, 63, Al+1, Al);
     scanptr = fill_a_scan(scanptr, 0, 1, 8, 0, Al+1);
     scanptr = fill_a_scan(scanptr, 0, 9, 63, 0, Al+1);
@@ -625,17 +629,17 @@ jpeg_search_progression (j_compress_ptr cinfo)
   
   scanptr = fill_a_scan(scanptr, 0, 1, 63, 0, 0);
   
-  for (i = 0; i < cinfo->num_frequency_splits; i++) {
+  for (i = 0; i < cinfo->master->num_frequency_splits; i++) {
     scanptr = fill_a_scan(scanptr, 0, 1, frequency_split[i], 0, 0);
     scanptr = fill_a_scan(scanptr, 0, frequency_split[i]+1, 63, 0, 0);
   }
   
   if (ncomps == 1) {
-    cinfo->Al_max_chroma = 0;
-    cinfo->num_scans_chroma_dc = 0;
+    cinfo->master->Al_max_chroma = 0;
+    cinfo->master->num_scans_chroma_dc = 0;
   } else {
-    cinfo->Al_max_chroma = 2;
-    cinfo->num_scans_chroma_dc = 3;
+    cinfo->master->Al_max_chroma = 2;
+    cinfo->master->num_scans_chroma_dc = 3;
     /* 41 scans for chroma */
     
     /* chroma DC combined */
@@ -649,7 +653,7 @@ jpeg_search_progression (j_compress_ptr cinfo)
     scanptr = fill_a_scan(scanptr, 2, 1, 8, 0, 0);
     scanptr = fill_a_scan(scanptr, 2, 9, 63, 0, 0);
 
-    for (Al = 0; Al < cinfo->Al_max_chroma; Al++) {
+    for (Al = 0; Al < cinfo->master->Al_max_chroma; Al++) {
       scanptr = fill_a_scan(scanptr, 1, 1, 63, Al+1, Al);
       scanptr = fill_a_scan(scanptr, 2, 1, 63, Al+1, Al);
       scanptr = fill_a_scan(scanptr, 1, 1, 8, 0, Al+1);
@@ -661,7 +665,7 @@ jpeg_search_progression (j_compress_ptr cinfo)
     scanptr = fill_a_scan(scanptr, 1, 1, 63, 0, 0);
     scanptr = fill_a_scan(scanptr, 2, 1, 63, 0, 0);
 
-    for (i = 0; i < cinfo->num_frequency_splits; i++) {
+    for (i = 0; i < cinfo->master->num_frequency_splits; i++) {
       scanptr = fill_a_scan(scanptr, 1, 1, frequency_split[i], 0, 0);
       scanptr = fill_a_scan(scanptr, 1, frequency_split[i]+1, 63, 0, 0);
       scanptr = fill_a_scan(scanptr, 2, 1, frequency_split[i], 0, 0);
@@ -684,7 +688,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
   int nscans;
   jpeg_scan_info * scanptr;
 
-  if (cinfo->optimize_scans) {
+  if (cinfo->master->optimize_scans) {
     if (jpeg_search_progression(cinfo) == TRUE)
       return;
   }
@@ -700,7 +704,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
     nscans = 10;
   } else {
     /* All-purpose script for other color spaces. */
-    if (cinfo->use_moz_defaults == TRUE) {
+    if (cinfo->master->use_moz_defaults == TRUE) {
     if (ncomps > MAX_COMPS_IN_SCAN)
         nscans = 5 * ncomps;	/* 2 DC + 4 AC scans per component */
       else
@@ -732,12 +736,12 @@ jpeg_simple_progression (j_compress_ptr cinfo)
 
   if (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr) {
     /* Custom script for YCbCr color images. */
-    if (cinfo->use_moz_defaults == TRUE) {
+    if (cinfo->master->use_moz_defaults == TRUE) {
       /* scan defined in jpeg_scan_rgb.txt in jpgcrush */
     /* Initial DC scan */
-      if (cinfo->one_dc_scan)
+      if (cinfo->master->one_dc_scan)
         scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
-      else if (cinfo->sep_dc_scan) {
+      else if (cinfo->master->sep_dc_scan) {
         scanptr = fill_a_scan(scanptr, 0, 0, 0, 0, 0);
         scanptr = fill_a_scan(scanptr, 1, 0, 0, 0, 0);
         scanptr = fill_a_scan(scanptr, 2, 0, 0, 0, 0);
@@ -780,7 +784,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
     }
   } else {
     /* All-purpose script for other color spaces. */
-    if (cinfo->use_moz_defaults == TRUE) {
+    if (cinfo->master->use_moz_defaults == TRUE) {
       /* scan defined in jpeg_scan_bw.txt in jpgcrush */
       /* DC component, no successive approximation */
       scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
diff --git a/jcphuff.c b/jcphuff.c
index 4fbb36dd..da2464ad 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -171,7 +171,7 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
           (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
       MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
-      if (cinfo->trellis_passes) {
+      if (cinfo->master->trellis_passes) {
         /* When generating tables for trellis passes, make sure that all */
         /* codewords have an assigned length */
         int i, j;
diff --git a/jcstest.c b/jcstest.c
index c0eec6e7..79829e6b 100644
--- a/jcstest.c
+++ b/jcstest.c
@@ -87,7 +87,8 @@ int main(void)
 
   jpeg_create_compress(&cinfo);
   cinfo.input_components = 3;
-  cinfo.use_moz_defaults = TRUE;
+  if (jpeg_c_bool_param_supported(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
+    jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
   jpeg_set_defaults(&cinfo);
   cinfo.in_color_space = JCS_EXT_RGB;
   jpeg_default_colorspace(&cinfo);
diff --git a/jctrans.c b/jctrans.c
index 8fffde03..f4be9f3b 100644
--- a/jctrans.c
+++ b/jctrans.c
@@ -6,6 +6,8 @@
  * Modified 2000-2009 by Guido Vollbeding.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains library routines for transcoding compression,
@@ -41,8 +43,8 @@ GLOBAL(void)
 jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)
 {
   /* setting up scan optimisation pattern failed, disable scan optimisation */
-  if (cinfo->num_scans_luma == 0)
-    cinfo->optimize_scans = FALSE;
+  if (cinfo->master->num_scans_luma == 0)
+    cinfo->master->optimize_scans = FALSE;
   
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -91,7 +93,7 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
 #endif
   /* Initialize all parameters to default values */
   jpeg_set_defaults(dstinfo);
-  dstinfo->trellis_quant = FALSE;
+  dstinfo->master->trellis_quant = FALSE;
   
   /* jpeg_set_defaults may choose wrong colorspace, eg YCbCr if input is RGB.
    * Fix it to get the right header markers for the image colorspace.
diff --git a/jdcol565.c b/jdcol565.c
index a2c98f30..695f2621 100644
--- a/jdcol565.c
+++ b/jdcol565.c
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modifications:
  * Copyright (C) 2013, Linaro Limited.
+ * Copyright (C) 2014, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains output colorspace conversion routines.
@@ -13,42 +14,11 @@
 /* This file is included by jdcolor.c */
 
 
-#define PACK_SHORT_565(r, g, b)   ((((r) << 8) & 0xf800) |  \
-                                   (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_TWO_PIXELS(l, r)     ((r << 16) | l)
-#define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
-
-#define WRITE_TWO_PIXELS(addr, pixels) {  \
-  ((INT16*)(addr))[0] = (pixels);  \
-  ((INT16*)(addr))[1] = (pixels) >> 16;  \
-}
-#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(INT32 *)(addr)) = pixels)
-
-#define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
-#define DITHER_565_G(g, dither)  ((g) + (((dither) & 0xFF) >> 1))
-#define DITHER_565_B(b, dither)  ((b) + ((dither) & 0xFF))
-
-
-/* Declarations for ordered dithering
- *
- * We use a 4x4 ordered dither array packed into 32 bits.  This array is
- * sufficent for dithering RGB888 to RGB565.
- */
-
-#define DITHER_MASK       0x3
-#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
-static const INT32 dither_matrix[4] = {
-  0x0008020A,
-  0x0C040E06,
-  0x030B0109,
-  0x0F070D05
-};
-
-
-METHODDEF(void)
-ycc_rgb565_convert (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION input_row,
-                    JSAMPARRAY output_buf, int num_rows)
+INLINE
+LOCAL(void)
+ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
+                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+                             JSAMPARRAY output_buf, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int y, cb, cr;
@@ -123,10 +93,11 @@ ycc_rgb565_convert (j_decompress_ptr cinfo,
 }
 
 
-METHODDEF(void)
-ycc_rgb565D_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+INLINE
+LOCAL(void)
+ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
+                              JSAMPIMAGE input_buf, JDIMENSION input_row,
+                              JSAMPARRAY output_buf, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int y, cb, cr;
@@ -208,10 +179,11 @@ ycc_rgb565D_convert (j_decompress_ptr cinfo,
 }
 
 
-METHODDEF(void)
-rgb_rgb565_convert (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION input_row,
-                    JSAMPARRAY output_buf, int num_rows)
+INLINE
+LOCAL(void)
+rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
+                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+                             JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
@@ -262,10 +234,11 @@ rgb_rgb565_convert (j_decompress_ptr cinfo,
 }
 
 
-METHODDEF(void)
-rgb_rgb565D_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+INLINE
+LOCAL(void)
+rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
+                              JSAMPIMAGE input_buf, JDIMENSION input_row,
+                              JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
@@ -320,10 +293,11 @@ rgb_rgb565D_convert (j_decompress_ptr cinfo,
 }
 
 
-METHODDEF(void)
-gray_rgb565_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+INLINE
+LOCAL(void)
+gray_rgb565_convert_internal (j_decompress_ptr cinfo,
+                              JSAMPIMAGE input_buf, JDIMENSION input_row,
+                              JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
@@ -359,10 +333,11 @@ gray_rgb565_convert (j_decompress_ptr cinfo,
 }
 
 
-METHODDEF(void)
-gray_rgb565D_convert (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION input_row,
-                      JSAMPARRAY output_buf, int num_rows)
+INLINE
+LOCAL(void)
+gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
+                               JSAMPIMAGE input_buf, JDIMENSION input_row,
+                               JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
diff --git a/jdcolor.c b/jdcolor.c
index ffedabd5..779fa51f 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -544,7 +544,162 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
 }
 
 
+/*
+ * RGB565 conversion
+ */
+
+#define PACK_SHORT_565_LE(r, g, b)   ((((r) << 8) & 0xF800) |  \
+                                      (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b)   (((r) & 0xF8) | ((g) >> 5) |  \
+                                      (((g) << 11) & 0xE000) |  \
+                                      (((b) << 5) & 0x1F00))
+
+#define PACK_TWO_PIXELS_LE(l, r)     ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r)     ((l << 16) | r)
+
+#define PACK_NEED_ALIGNMENT(ptr)     (((size_t)(ptr)) & 3)
+
+#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(int *)(addr)) = pixels)
+
+#define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
+#define DITHER_565_G(g, dither)  ((g) + (((dither) & 0xFF) >> 1))
+#define DITHER_565_B(b, dither)  ((b) + ((dither) & 0xFF))
+
+
+/* Declarations for ordered dithering
+ *
+ * We use a 4x4 ordered dither array packed into 32 bits.  This array is
+ * sufficient for dithering RGB888 to RGB565.
+ */
+
+#define DITHER_MASK       0x3
+#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const INT32 dither_matrix[4] = {
+  0x0008020A,
+  0x0C040E06,
+  0x030B0109,
+  0x0F070D05
+};
+
+
+static INLINE boolean is_big_endian(void)
+{
+  int test_value = 1;
+  if(*(char *)&test_value != 1)
+    return TRUE;
+  return FALSE;
+}
+
+
+/* Include inline routines for RGB565 conversion */
+
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
+#define gray_rgb565_convert_internal gray_rgb565_convert_le
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
 #include "jdcol565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef ycc_rgb565_convert_internal
+#undef ycc_rgb565D_convert_internal
+#undef rgb_rgb565_convert_internal
+#undef rgb_rgb565D_convert_internal
+#undef gray_rgb565_convert_internal
+#undef gray_rgb565D_convert_internal
+
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
+#define gray_rgb565_convert_internal gray_rgb565_convert_be
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
+#include "jdcol565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef ycc_rgb565_convert_internal
+#undef ycc_rgb565D_convert_internal
+#undef rgb_rgb565_convert_internal
+#undef rgb_rgb565D_convert_internal
+#undef gray_rgb565_convert_internal
+#undef gray_rgb565D_convert_internal
+
+
+METHODDEF(void)
+ycc_rgb565_convert (j_decompress_ptr cinfo,
+                    JSAMPIMAGE input_buf, JDIMENSION input_row,
+                    JSAMPARRAY output_buf, int num_rows)
+{
+  if (is_big_endian())
+    ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+  else
+    ycc_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+ycc_rgb565D_convert (j_decompress_ptr cinfo,
+                     JSAMPIMAGE input_buf, JDIMENSION input_row,
+                     JSAMPARRAY output_buf, int num_rows)
+{
+  if (is_big_endian())
+    ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+  else
+    ycc_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+rgb_rgb565_convert (j_decompress_ptr cinfo,
+                    JSAMPIMAGE input_buf, JDIMENSION input_row,
+                    JSAMPARRAY output_buf, int num_rows)
+{
+  if (is_big_endian())
+    rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+  else
+    rgb_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+rgb_rgb565D_convert (j_decompress_ptr cinfo,
+                     JSAMPIMAGE input_buf, JDIMENSION input_row,
+                     JSAMPARRAY output_buf, int num_rows)
+{
+  if (is_big_endian())
+    rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+  else
+    rgb_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+gray_rgb565_convert (j_decompress_ptr cinfo,
+                     JSAMPIMAGE input_buf, JDIMENSION input_row,
+                     JSAMPARRAY output_buf, int num_rows)
+{
+  if (is_big_endian())
+    gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+  else
+    gray_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+gray_rgb565D_convert (j_decompress_ptr cinfo,
+                      JSAMPIMAGE input_buf, JDIMENSION input_row,
+                      JSAMPARRAY output_buf, int num_rows)
+{
+  if (is_big_endian())
+    gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+  else
+    gray_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+}
 
 
 /*
diff --git a/jdmerge.c b/jdmerge.c
index f89d69f5..e13adb9f 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014 D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -45,38 +45,6 @@
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
 
-#define PACK_SHORT_565(r, g, b)   ((((r) << 8) & 0xf800) |  \
-                                   (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_TWO_PIXELS(l, r)     ((r << 16) | l)
-#define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
-
-#define WRITE_TWO_PIXELS(addr, pixels) {  \
-  ((INT16*)(addr))[0] = (pixels);  \
-  ((INT16*)(addr))[1] = (pixels) >> 16;  \
-}
-#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(INT32 *)(addr)) = pixels)
-
-#define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
-#define DITHER_565_G(g, dither)  ((g) + (((dither) & 0xFF) >> 1))
-#define DITHER_565_B(b, dither)  ((b) + ((dither) & 0xFF))
-
-
-/* Declarations for ordered dithering
- *
- * We use a 4x4 ordered dither array packed into 32 bits.  This array is
- * sufficent for dithering RGB888 to RGB565.
- */
-
-#define DITHER_MASK       0x3
-#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
-static const INT32 dither_matrix[4] = {
-  0x0008020A,
-  0x0C040E06,
-  0x030B0109,
-  0x0F070D05
-};
-
-
 /* Private subobject */
 
 typedef struct {
@@ -451,72 +419,106 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
 }
 
 
+/*
+ * RGB565 conversion
+ */
+
+#define PACK_SHORT_565_LE(r, g, b)   ((((r) << 8) & 0xF800) |  \
+                                      (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b)   (((r) & 0xF8) | ((g) >> 5) |  \
+                                      (((g) << 11) & 0xE000) |  \
+                                      (((b) << 5) & 0x1F00))
+
+#define PACK_TWO_PIXELS_LE(l, r)     ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r)     ((l << 16) | r)
+
+#define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
+
+#define WRITE_TWO_PIXELS_LE(addr, pixels) {  \
+  ((INT16*)(addr))[0] = (pixels);  \
+  ((INT16*)(addr))[1] = (pixels) >> 16;  \
+}
+#define WRITE_TWO_PIXELS_BE(addr, pixels) {  \
+  ((INT16*)(addr))[1] = (pixels);  \
+  ((INT16*)(addr))[0] = (pixels) >> 16;  \
+}
+
+#define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
+#define DITHER_565_G(g, dither)  ((g) + (((dither) & 0xFF) >> 1))
+#define DITHER_565_B(b, dither)  ((b) + ((dither) & 0xFF))
+
+
+/* Declarations for ordered dithering
+ *
+ * We use a 4x4 ordered dither array packed into 32 bits.  This array is
+ * sufficient for dithering RGB888 to RGB565.
+ */
+
+#define DITHER_MASK       0x3
+#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const INT32 dither_matrix[4] = {
+  0x0008020A,
+  0x0C040E06,
+  0x030B0109,
+  0x0F070D05
+};
+
+
+/* Include inline routines for RGB565 conversion */
+
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
+#include "jdmrg565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef WRITE_TWO_PIXELS
+#undef h2v1_merged_upsample_565_internal
+#undef h2v1_merged_upsample_565D_internal
+#undef h2v2_merged_upsample_565_internal
+#undef h2v2_merged_upsample_565D_internal
+
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
+#include "jdmrg565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef WRITE_TWO_PIXELS
+#undef h2v1_merged_upsample_565_internal
+#undef h2v1_merged_upsample_565D_internal
+#undef h2v2_merged_upsample_565_internal
+#undef h2v2_merged_upsample_565D_internal
+
+
+static INLINE boolean is_big_endian(void)
+{
+  int test_value = 1;
+  if(*(char *)&test_value != 1)
+    return TRUE;
+  return FALSE;
+}
+
+
 METHODDEF(void)
 h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
                           JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  unsigned int r, g, b;
-  INT32 rgb;
-  SHIFT_TEMPS
-
-  inptr0 = input_buf[0][in_row_group_ctr];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr = output_buf[0];
-
-  /* Loop for each pair of output pixels */
-  for (col = cinfo->output_width >> 1; col > 0; col--) {
-    /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-
-    /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_SHORT_565(r, g, b);
-
-    y  = GETJSAMPLE(*inptr0++);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
-
-    WRITE_TWO_PIXELS(outptr, rgb);
-    outptr += 4;
-  }
-
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = rgb;
-   }
+  if (is_big_endian())
+    h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
+                                output_buf);
+  else
+    h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
+                                output_buf);
  }
 
 
@@ -525,70 +527,12 @@ h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
                            JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
                            JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
-  unsigned int r, g, b;
-  INT32 rgb;
-  SHIFT_TEMPS
-
-  inptr0 = input_buf[0][in_row_group_ctr];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr = output_buf[0];
-
-  /* Loop for each pair of output pixels */
-  for (col = cinfo->output_width >> 1; col > 0; col--) {
-    /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-
-    /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    d0 = DITHER_ROTATE(d0);
-    rgb = PACK_SHORT_565(r, g, b);
-
-    y  = GETJSAMPLE(*inptr0++);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    d0 = DITHER_ROTATE(d0);
-    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
-
-    WRITE_TWO_PIXELS(outptr, rgb);
-    outptr += 4;
-  }
-
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = rgb;
-  }
+  if (is_big_endian())
+    h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
+                                 output_buf);
+  else
+    h2v1_merged_upsample_565D_le(cinfo, input_buf, in_row_group_ctr,
+                                 output_buf);
 }
 
 
@@ -597,92 +541,12 @@ h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
                           JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  unsigned int r, g, b;
-  INT32 rgb;
-  SHIFT_TEMPS
-
-  inptr00 = input_buf[0][in_row_group_ctr * 2];
-  inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr0 = output_buf[0];
-  outptr1 = output_buf[1];
-
-  /* Loop for each group of output pixels */
-  for (col = cinfo->output_width >> 1; col > 0; col--) {
-    /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-
-    /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_SHORT_565(r, g, b);
-
-    y  = GETJSAMPLE(*inptr00++);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
-
-    WRITE_TWO_PIXELS(outptr0, rgb);
-    outptr0 += 4;
-
-    y  = GETJSAMPLE(*inptr01++);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_SHORT_565(r, g, b);
-
-    y  = GETJSAMPLE(*inptr01++);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
-
-    WRITE_TWO_PIXELS(outptr1, rgb);
-    outptr1 += 4;
-  }
-
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-
-    y  = GETJSAMPLE(*inptr00);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = rgb;
-
-    y  = GETJSAMPLE(*inptr01);
-    r = range_limit[y + cred];
-    g = range_limit[y + cgreen];
-    b = range_limit[y + cblue];
-    rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = rgb;
-  }
+  if (is_big_endian())
+    h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
+                                output_buf);
+  else
+    h2v2_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
+                                output_buf);
 }
 
 
@@ -691,98 +555,12 @@ h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
                            JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
                            JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
-  INT32 d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
-  unsigned int r, g, b;
-  INT32 rgb;
-  SHIFT_TEMPS
-
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr0 = output_buf[0];
-  outptr1 = output_buf[1];
-
-  /* Loop for each group of output pixels */
-  for (col = cinfo->output_width >> 1; col > 0; col--) {
-    /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-
-    /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    d0 = DITHER_ROTATE(d0);
-    rgb = PACK_SHORT_565(r, g, b);
-
-    y  = GETJSAMPLE(*inptr00++);
-    r = range_limit[DITHER_565_R(y + cred, d1)];
-    g = range_limit[DITHER_565_G(y + cgreen, d1)];
-    b = range_limit[DITHER_565_B(y + cblue, d1)];
-    d1 = DITHER_ROTATE(d1);
-    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
-
-    WRITE_TWO_PIXELS(outptr0, rgb);
-    outptr0 += 4;
-
-    y  = GETJSAMPLE(*inptr01++);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    d0 = DITHER_ROTATE(d0);
-    rgb = PACK_SHORT_565(r, g, b);
-
-    y  = GETJSAMPLE(*inptr01++);
-    r = range_limit[DITHER_565_R(y + cred, d1)];
-    g = range_limit[DITHER_565_G(y + cgreen, d1)];
-    b = range_limit[DITHER_565_B(y + cblue, d1)];
-    d1 = DITHER_ROTATE(d1);
-    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
-
-    WRITE_TWO_PIXELS(outptr1, rgb);
-    outptr1 += 4;
-  }
-
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-
-    y  = GETJSAMPLE(*inptr00);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = rgb;
-
-    y  = GETJSAMPLE(*inptr01);
-    r = range_limit[DITHER_565_R(y + cred, d1)];
-    g = range_limit[DITHER_565_G(y + cgreen, d1)];
-    b = range_limit[DITHER_565_B(y + cblue, d1)];
-    rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = rgb;
-  }
+  if (is_big_endian())
+    h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
+                                 output_buf);
+  else
+    h2v2_merged_upsample_565D_le(cinfo, input_buf, in_row_group_ctr,
+                                 output_buf);
 }
 
 
diff --git a/jdmrg565.c b/jdmrg565.c
new file mode 100644
index 00000000..0a10bccb
--- /dev/null
+++ b/jdmrg565.c
@@ -0,0 +1,355 @@
+/*
+ * jdmrg565.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2013, Linaro Limited.
+ * Copyright (C) 2014, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains code for merged upsampling/color conversion.
+ */
+
+/* h2v1 merged upsample + YCbCr->RGB565 conversion of one row, no dithering */
+INLINE
+LOCAL(void)
+h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION in_row_group_ctr,
+                                   JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr;
+  JSAMPROW inptr0, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  /* Loop for each pair of output pixels (one Cb/Cr sample per pair) */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation (shared by both pixels) */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+
+    /* Fetch 2 Y values and emit 2 pixels, packed together into one INT32 */
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r, g, b);
+
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+    WRITE_TWO_PIXELS(outptr, rgb);
+    outptr += 4;
+  }
+
+  /* If image width is odd, emit the last column alone as a single INT16 */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    y  = GETJSAMPLE(*inptr0);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r, g, b);
+    *(INT16*)outptr = rgb;
+   }
+ }
+
+/* h2v1 merged upsample + YCbCr->RGB565 conversion of one row, dithered */
+INLINE
+LOCAL(void)
+h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr;
+  JSAMPROW inptr0, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  /* Loop for each pair of output pixels (one Cb/Cr sample per pair) */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation (shared by both pixels) */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+
+    /* Fetch 2 Y values and emit 2 pixels; d0 is rotated after each pixel */
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_SHORT_565(r, g, b);
+
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+    WRITE_TWO_PIXELS(outptr, rgb);
+    outptr += 4;
+  }
+
+  /* If image width is odd, emit the last column alone as a single INT16 */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    y  = GETJSAMPLE(*inptr0);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    rgb = PACK_SHORT_565(r, g, b);
+    *(INT16*)outptr = rgb;
+  }
+}
+
+/* h2v2 merged upsample + YCbCr->RGB565 conversion of two rows, no dithering */
+INLINE
+LOCAL(void)
+h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION in_row_group_ctr,
+                                   JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr0, outptr1;
+  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr00 = input_buf[0][in_row_group_ctr * 2];
+  inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  /* Loop for each 2x2 group of output pixels (one Cb/Cr sample per group) */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation (shared by all 4 pixels) */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+
+    /* Fetch 4 Y values and emit 4 pixels (2 per output row) */
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r, g, b);
+
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+    WRITE_TWO_PIXELS(outptr0, rgb);
+    outptr0 += 4;
+
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r, g, b);
+
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+    WRITE_TWO_PIXELS(outptr1, rgb);
+    outptr1 += 4;
+  }
+
+  /* If image width is odd, emit the last column of both rows separately */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+
+    y  = GETJSAMPLE(*inptr00);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r, g, b);
+    *(INT16*)outptr0 = rgb;
+
+    y  = GETJSAMPLE(*inptr01);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r, g, b);
+    *(INT16*)outptr1 = rgb;
+  }
+}
+
+/* h2v2 merged upsample + YCbCr->RGB565 conversion of two rows, dithered */
+INLINE
+LOCAL(void)
+h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr0, outptr1;
+  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  INT32 d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr00 = input_buf[0][in_row_group_ctr*2];
+  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  /* Loop for each 2x2 group of output pixels (one Cb/Cr sample per group) */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation (shared by all 4 pixels) */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    /* NOTE(review): d0/d1 alternate by column, not by row -- intentional? */
+    /* Fetch 4 Y values and emit 4 pixels (2 per output row) */
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_SHORT_565(r, g, b);
+
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    d1 = DITHER_ROTATE(d1);
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+    WRITE_TWO_PIXELS(outptr0, rgb);
+    outptr0 += 4;
+
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_SHORT_565(r, g, b);
+
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    d1 = DITHER_ROTATE(d1);
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+    WRITE_TWO_PIXELS(outptr1, rgb);
+    outptr1 += 4;
+  }
+
+  /* If image width is odd, emit the last column of both rows separately */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+
+    y  = GETJSAMPLE(*inptr00);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    rgb = PACK_SHORT_565(r, g, b);
+    *(INT16*)outptr0 = rgb;
+
+    y  = GETJSAMPLE(*inptr01);
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    rgb = PACK_SHORT_565(r, g, b);
+    *(INT16*)outptr1 = rgb;
+  }
+}
diff --git a/jerror.h b/jerror.h
index 402613e0..816fdf9a 100644
--- a/jerror.h
+++ b/jerror.h
@@ -207,6 +207,7 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 #endif
+JMESSAGE(JERR_BAD_PARAM, "Bogus parameter")
 
 #ifdef JMAKE_ENUM_LIST
 
diff --git a/jpegint.h b/jpegint.h
index 37742dde..bc0d3dbe 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -57,6 +57,38 @@ struct jpeg_comp_master {
   /* State variables made visible to other modules */
   boolean call_pass_startup;    /* True if pass_startup must be called */
   boolean is_last_pass;         /* True during last pass */
+
+  /* Extension parameters */
+  boolean use_moz_defaults; /* TRUE=use Mozilla defaults */
+  boolean optimize_scans; /* TRUE=optimize progressive coding scans */
+  boolean one_dc_scan; /* TRUE=use a single DC scan interleaving all components */
+  boolean sep_dc_scan; /* TRUE=each DC scan is separate */
+  boolean trellis_quant; /* TRUE=use trellis quantization */
+  boolean trellis_quant_dc; /* TRUE=use trellis quant for DC coefficient */
+  boolean trellis_eob_opt; /* TRUE=optimize for sequences of EOB */
+  boolean use_flat_quant_tbl; /* TRUE=use flat quantization table */
+  boolean use_lambda_weight_tbl; /* TRUE=use lambda weighting table */
+  boolean use_scans_in_trellis; /* TRUE=use scans in trellis optimization */
+  boolean trellis_passes; /* TRUE=currently doing trellis-related passes */
+  boolean trellis_q_opt; /* TRUE=optimize quant table in trellis loop */
+  boolean overshoot_deringing; /* TRUE=preprocess input to reduce ringing of edges on white background */
+
+  double norm_src[NUM_QUANT_TBLS][DCTSIZE2];
+  double norm_coef[NUM_QUANT_TBLS][DCTSIZE2];
+
+  int trellis_freq_split; /* splitting point for frequency in trellis quantization */
+  int trellis_num_loops; /* number of trellis loops */
+
+  int num_scans_luma; /* # of entries in scan_info array pertaining to luma (used when optimize_scans is TRUE) */
+  int num_scans_luma_dc;
+  int num_scans_chroma_dc;
+  int num_frequency_splits;
+
+  int Al_max_luma; /* maximum value of Al tested when optimizing scans (luma) */
+  int Al_max_chroma; /* maximum value of Al tested when optimizing scans (chroma) */
+
+  float lambda_log_scale1;
+  float lambda_log_scale2;
 };
 
 /* Main buffer control (downsampled-data buffer) */
diff --git a/jpeglib.h b/jpeglib.h
index e5d78ebd..2f000723 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -265,6 +265,45 @@ typedef enum {
 } J_DITHER_MODE;
 
 
+/* These 32-bit GUIDs and the corresponding jpeg_*_get_*_param()/
+ * jpeg_*_set_*_param() functions allow for extending the libjpeg API without
+ * breaking backward ABI compatibility.  The actual parameters are stored in
+ * the opaque jpeg_comp_master and jpeg_decomp_master structs.
+ */
+
+/* Boolean extension parameters */
+
+typedef enum {
+  JBOOLEAN_USE_MOZ_DEFAULTS = 0xAE2F5D7F, /* TRUE=use Mozilla defaults */
+  JBOOLEAN_OPTIMIZE_SCANS = 0x680C061E, /* TRUE=optimize progressive coding scans */
+  JBOOLEAN_ONE_DC_SCAN = 0x3DA6A269, /* TRUE=use a single DC scan interleaving all components */
+  JBOOLEAN_SEP_DC_SCAN = 0xE20DFA9F, /* TRUE=each DC scan is separate */
+  JBOOLEAN_TRELLIS_QUANT = 0xC5122033, /* TRUE=use trellis quantization */
+  JBOOLEAN_TRELLIS_QUANT_DC = 0x339D4C0C, /* TRUE=use trellis quant for DC coefficient */
+  JBOOLEAN_TRELLIS_EOB_OPT = 0xD7F73780, /* TRUE=optimize for sequences of EOB */
+  JBOOLEAN_USE_FLAT_QUANT_TBL = 0xE807EC6C, /* TRUE=use flat quantization table */
+  JBOOLEAN_USE_LAMBDA_WEIGHT_TBL = 0x339DB65F, /* TRUE=use lambda weighting table */
+  JBOOLEAN_USE_SCANS_IN_TRELLIS = 0xFD841435, /* TRUE=use scans in trellis optimization */
+  JBOOLEAN_TRELLIS_PASSES = 0x3FF8A439, /* TRUE=currently doing trellis-related passes */
+  JBOOLEAN_TRELLIS_Q_OPT = 0xE12AE269, /* TRUE=optimize quant table in trellis loop */
+  JBOOLEAN_OVERSHOOT_DERINGING = 0x3F4BBBF9 /* TRUE=preprocess input to reduce ringing of edges on white background */
+} J_BOOLEAN_PARAM;
+
+/* Floating point parameters */
+
+typedef enum {
+  JFLOAT_LAMBDA_LOG_SCALE1 = 0x5B61A599,
+  JFLOAT_LAMBDA_LOG_SCALE2 = 0xB9BBAE03
+} J_FLOAT_PARAM;
+
+/* Integer parameters */
+
+typedef enum {
+  JINT_TRELLIS_FREQ_SPLIT = 0x6FAFF127, /* splitting point for frequency in trellis quantization */
+  JINT_TRELLIS_NUM_LOOPS = 0xB63EBF39 /* number of trellis loops */
+} J_INT_PARAM;
+
+
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
@@ -374,36 +413,6 @@ struct jpeg_compress_struct {
   int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
   J_DCT_METHOD dct_method;	/* DCT algorithm selector */
 
-  boolean use_moz_defaults; /* TRUE=use Mozilla defaults */
-  boolean optimize_scans; /* TRUE=optimize progressive coding scans */
-  boolean one_dc_scan; /* TRUE=use a single DC scan interleaving all components */
-  boolean sep_dc_scan; /* TRUE=each DC scan is separate */
-  boolean trellis_quant; /* TRUE=use trellis quantization */
-  boolean trellis_quant_dc; /* TRUE=use trellis quant for DC coefficient */
-  boolean trellis_eob_opt; /* TRUE=optimize for sequences of EOB */
-  boolean use_flat_quant_tbl; /* TRUE=use flat quantization table */
-  boolean use_lambda_weight_tbl; /* TRUE=use lambda weighting table */
-  boolean use_scans_in_trellis; /* TRUE=use scans in trellis optimization */
-  boolean trellis_passes; /* TRUE=currently doing trellis-related passes */
-  boolean trellis_q_opt; /* TRUE=optimize quant table in trellis loop */
-  
-  double norm_src[NUM_QUANT_TBLS][DCTSIZE2];
-  double norm_coef[NUM_QUANT_TBLS][DCTSIZE2];
-
-  int trellis_freq_split; /* splitting point for frequency in trellis quantization */
-  int trellis_num_loops; /* number of trellis loops */
-  
-  int num_scans_luma; /* # of entries in scan_info array pertaining to luma (used when optimize_scans is TRUE */
-  int num_scans_luma_dc;
-  int num_scans_chroma_dc;
-  int num_frequency_splits;
-  
-  int Al_max_luma; /* maximum value of Al tested when optimizing scans (luma) */
-  int Al_max_chroma; /* maximum value of Al tested when optimizing scans (chroma) */
-
-  float lambda_log_scale1;
-  float lambda_log_scale2;
-  
   /* The restart interval can be specified in absolute MCUs by setting
    * restart_interval, or in MCU rows by setting restart_in_rows
    * (in which case the correct restart_interval will be figured
@@ -1073,6 +1082,27 @@ EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
 /* Default restart-marker-resync procedure for use by data source modules */
 EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
 
+/* Accessor functions for extension parameters */
+EXTERN(boolean) jpeg_c_bool_param_supported (j_compress_ptr cinfo,
+                                             J_BOOLEAN_PARAM param);
+EXTERN(void) jpeg_c_set_bool_param (j_compress_ptr cinfo,
+                                    J_BOOLEAN_PARAM param, boolean value);
+EXTERN(boolean) jpeg_c_get_bool_param (j_compress_ptr cinfo,
+                                       J_BOOLEAN_PARAM param);
+
+EXTERN(boolean) jpeg_c_float_param_supported (j_compress_ptr cinfo,
+                                              J_FLOAT_PARAM param);
+EXTERN(void) jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param,
+                                     float value);
+EXTERN(float) jpeg_c_get_float_param (j_compress_ptr cinfo,
+                                      J_FLOAT_PARAM param);
+
+EXTERN(boolean) jpeg_c_int_param_supported (j_compress_ptr cinfo,
+                                            J_INT_PARAM param);
+EXTERN(void) jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param,
+                                   int value);
+EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
+
 
 /* These marker codes are exported since applications and data source modules
  * are likely to want to use them.
diff --git a/jpegtran.c b/jpegtran.c
index 5ee342db..8275d1a9 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -234,7 +234,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
         usage();
 
     } else if (keymatch(arg, "fastcrush", 4)) {
-      cinfo->optimize_scans = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
       
     } else if (keymatch(arg, "grayscale", 1) || keymatch(arg, "greyscale",1)) {
       /* Force to grayscale. */
@@ -310,7 +310,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "revert", 3)) {
       /* revert to old JPEG default */
-      cinfo->use_moz_defaults = FALSE;
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, FALSE);
       
     } else if (keymatch(arg, "rotate", 2)) {
       /* Rotate 90, 180, or 270 degrees (measured clockwise). */
@@ -415,7 +415,8 @@ main (int argc, char **argv)
   /* Initialize the JPEG compression object with default error handling. */
   dstinfo.err = jpeg_std_error(&jdsterr);
   jpeg_create_compress(&dstinfo);
-  dstinfo.use_moz_defaults = TRUE;
+  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
+    jpeg_c_set_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
 
   /* Scan command line to find file names.
    * It is convenient to use just one switch-parsing routine, but the switch
@@ -469,7 +470,8 @@ main (int argc, char **argv)
 #endif
 
   /* Specify data source for decompression */
-  memsrc = dstinfo.use_moz_defaults; /* needed to revert to original */
+  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
+    memsrc = jpeg_c_get_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS); /* needed to revert to original */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   if (memsrc) {
     size_t nbytes;
@@ -555,7 +557,8 @@ main (int argc, char **argv)
 
   /* Specify data destination for compression */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
-  if (dstinfo.use_moz_defaults)
+  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS) &&
+      jpeg_c_get_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
     jpeg_mem_dest(&dstinfo, &outbuffer, &outsize);
   else
 #endif
@@ -577,7 +580,8 @@ main (int argc, char **argv)
   /* Finish compression and release memory */
   jpeg_finish_compress(&dstinfo);
   
-  if (dstinfo.use_moz_defaults) {
+  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS) &&
+      jpeg_c_get_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS)) {
     size_t nbytes;
     
     unsigned char *buffer = outbuffer;
diff --git a/md5/md5.c b/md5/md5.c
index 7193e95b..b30df974 100644
--- a/md5/md5.c
+++ b/md5/md5.c
@@ -38,6 +38,15 @@ static void MD5Transform(unsigned int [4], const unsigned char [64]);
 #define Decode memcpy
 #else 
 
+/*
+ * OS X doesn't have le32toh() or htole32()
+ */
+#ifdef __APPLE__
+#include <libkern/OSByteOrder.h>
+#define le32toh(x) OSSwapLittleToHostInt32(x)
+#define htole32(x) OSSwapHostToLittleInt32(x)
+#endif
+
 /*
  * Encodes input (unsigned int) into output (unsigned char). Assumes len is
  * a multiple of 4.
diff --git a/rdswitch.c b/rdswitch.c
index 75b2081c..66f8e59c 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -314,7 +314,8 @@ static const unsigned int flat_quant_tbl[DCTSIZE2] = {
 LOCAL(void)
 jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
 {
-  if (cinfo->use_flat_quant_tbl) {
+  if (jpeg_c_bool_param_supported(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL) &&
+      jpeg_c_get_bool_param(cinfo, JBOOLEAN_USE_FLAT_QUANT_TBL)) {
     jpeg_add_quant_table(cinfo, 0, flat_quant_tbl,
                          q_scale_factor[0], force_baseline);
     jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
diff --git a/simd/Makefile.am b/simd/Makefile.am
index bc631452..5e664494 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -70,6 +70,13 @@ libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S
 
 endif
 
+if SIMD_POWERPC
+
+libsimd_la_SOURCES = jsimd_powerpc.c jsimd_powerpc_altivec.c
+libsimd_la_CFLAGS = -maltivec 
+
+endif
+
 AM_CPPFLAGS = -I$(top_srcdir)
 
 .asm.lo:
diff --git a/simd/jsimd.h b/simd/jsimd.h
index c5abd458..b0329728 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,7 +2,7 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2011 D. R. Commander
+ * Copyright (C) 2011, 2014 D. R. Commander
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
  * Copyright (C) 2014 Linaro Limited
  *
@@ -21,6 +21,7 @@
 #define JSIMD_SSE2       0x08
 #define JSIMD_ARM_NEON   0x10
 #define JSIMD_MIPS_DSPR2 0x20
+#define JSIMD_ALTIVEC    0x40
 
 /* SIMD Ext: retrieve SIMD/CPU information */
 EXTERN(unsigned int) jpeg_simd_cpu_support (void);
@@ -554,6 +555,8 @@ EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM * data);
 
 EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM * data);
 
+EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM * data);
+
 /* Floating Point Forward DCT */
 EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT * data);
 
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
new file mode 100644
index 00000000..a9a5965d
--- /dev/null
+++ b/simd/jsimd_powerpc.c
@@ -0,0 +1,358 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011, 2014 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+static unsigned int simd_support = ~0;
+/* Set simd_support once: AltiVec is assumed present (no CPU probing here) */
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = JSIMD_ALTIVEC;
+
+  /* JSIMD_FORCENONE=1 in the environment clears simd_support to 0 */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  return 0;
+}
+/* Returns 1 only if DCTSIZE is 8, DCTELEM is 2 bytes and AltiVec is enabled */
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+/* Calls the AltiVec routine directly; simd_support is not re-checked here */
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+}
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
new file mode 100644
index 00000000..84132d03
--- /dev/null
+++ b/simd/jsimd_powerpc_altivec.c
@@ -0,0 +1,190 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+#include <altivec.h>
+
+#define TRANSPOSE(row, col)  \
+{  /* 8x8 transpose of 16-bit lanes: reads row##0..7, writes col##0..7 */  \
+  __vector short row04l, row04h, row15l, row15h,  \
+                 row26l, row26h, row37l, row37h;  \
+  __vector short col01e, col01o, col23e, col23o,  \
+                 col45e, col45o, col67e, col67o;  \
+  \
+                                       /* transpose coefficients (phase 1) */ \
+  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+  \
+                                       /* transpose coefficients (phase 2) */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+  \
+                                       /* transpose coefficients (phase 3) */ \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
+}
+
+static const __vector short constants __attribute__((aligned(16))) =
+{  /* Multipliers for DO_DCT; each is splatted by lane index in jsimd_fdct_ifast_altivec. */
+  98 << 5,   /* FIX(0.382683433) */
+  139 << 5,  /* FIX(0.541196100) */
+  181 << 5,  /* FIX(0.707106781) */
+  334 << 5   /* FIX(1.306562965) */
+};
+
+#define DO_DCT()  \
+{  /* Reads tmp0-tmp7, writes out0-out7; clobbers tmp10-tmp13, z1-z5, z11, z13 */  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  \
+  z1 = vec_add(tmp12, tmp13);  \
+  z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS);  \
+  z1 = vec_madds(z1, PW_0707, zero);  \
+  \
+  out2 = vec_add(tmp13, z1);  \
+  out6 = vec_sub(tmp13, z1);  \
+  \
+  /* Odd part */  \
+  \
+  tmp10 = vec_add(tmp4, tmp5);  \
+  tmp11 = vec_add(tmp5, tmp6);  \
+  tmp12 = vec_add(tmp6, tmp7);  \
+  \
+  tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS);  \
+  tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS);  \
+  z5 = vec_sub(tmp10, tmp12);  \
+  z5 = vec_madds(z5, PW_0382, zero);  \
+  \
+  z2 = vec_madds(tmp10, PW_0541, zero);  \
+  z2 = vec_add(z2, z5);  \
+  \
+  z4 = vec_madds(tmp12, PW_1306, zero);  \
+  z4 = vec_add(z4, z5);  \
+  \
+  tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
+  z3 = vec_madds(tmp11, PW_0707, zero);  \
+  \
+  z11 = vec_add(tmp7, z3);  \
+  z13 = vec_sub(tmp7, z3);  \
+  \
+  out5 = vec_add(z13, z2);  \
+  out3 = vec_sub(z13, z2);  \
+  out1 = vec_add(z11, z4);  \
+  out7 = vec_sub(z11, z4);  \
+}
+
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{  /* In-place transform of the 64 elements at data[0..63], eight per vector. */
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants (each multiplier is splatted from one lane of `constants`) */
+  __vector short zero = vec_splat_s16(0),
+    PW_0382 = vec_splat(constants, 0),
+    PW_0541 = vec_splat(constants, 1),
+    PW_0707 = vec_splat(constants, 2),
+    PW_1306 = vec_splat(constants, 3);
+  __vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);  /* vec_sl count used by DO_DCT */
+
+  /* Pass 1: process rows (load 8 row vectors, transpose, run DO_DCT). */
+
+  row0 = *(__vector short *)&data[0];  /* NOTE(review): vector cast assumes data is 16-byte aligned -- confirm */
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_DCT();
+
+  /* Pass 2: process columns (transpose pass-1 output, run DO_DCT, store). */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_DCT();
+
+  *(__vector short *)&data[0] = out0;  /* results overwrite the input block */
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
diff --git a/turbojpeg.c b/turbojpeg.c
index 657be0b9..dec18e3e 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -206,7 +206,7 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo,
 	}
 
 	cinfo->input_components=tjPixelSize[pixelFormat];
-	cinfo->use_moz_defaults = TRUE;
+	cinfo->master->use_moz_defaults = TRUE;
 	jpeg_set_defaults(cinfo);
 
 	if((env=getenv("TJ_OPTIMIZE"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
@@ -241,7 +241,7 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo,
 	else jpeg_set_colorspace(cinfo, JCS_YCbCr);
 
 	/* Set scan pattern again as colorspace might have changed */
-	if (cinfo->use_moz_defaults)
+	if (cinfo->master->use_moz_defaults)
 		jpeg_simple_progression(cinfo);
   
 	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
@@ -616,9 +616,10 @@ DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
 	nc=(subsamp==TJSAMP_GRAY? 1:3);
 	for(i=0; i<nc; i++)
 	{
-		int stride=PAD(tjPlaneWidth(i, width, subsamp), pad);
+		int pw=tjPlaneWidth(i, width, subsamp);
+		int stride=PAD(pw, pad);
 		int ph=tjPlaneHeight(i, height, subsamp);
-		if(stride<0 || ph<0) return -1;
+		if(pw<0 || ph<0) return -1;
 		else retval+=stride*ph;
 	}
 
@@ -692,6 +693,7 @@ DLLEXPORT unsigned long DLLCALL tjPlaneSizeYUV(int componentID, int width,
 
 	pw=tjPlaneWidth(componentID, width, subsamp);
 	ph=tjPlaneHeight(componentID, height, subsamp);
+	if(pw<0 || ph<0) return -1;
 
 	if(stride==0) stride=pw;
 	else stride=abs(stride);
diff --git a/win/jpeg62-memsrcdst.def b/win/jpeg62-memsrcdst.def
old mode 100755
new mode 100644
index 4511c8ef..2876559d
--- a/win/jpeg62-memsrcdst.def
+++ b/win/jpeg62-memsrcdst.def
@@ -102,3 +102,13 @@ EXPORTS
 	jzero_far @ 101 ; 
 	jpeg_mem_dest @ 102 ; 
 	jpeg_mem_src @ 103 ; 
+	jpeg_c_bool_param_supported @ 200 ; 
+	jpeg_c_set_bool_param @ 201 ; 
+	jpeg_c_get_bool_param @ 202 ; 
+	jpeg_c_float_param_supported @ 203 ; 
+	jpeg_c_set_float_param @ 204 ; 
+	jpeg_c_get_float_param @ 205 ; 
+	jpeg_c_int_param_supported @ 206 ; 
+	jpeg_c_set_int_param @ 207 ; 
+	jpeg_c_get_int_param @ 208 ; 
+	jpeg_float_quality_scaling @ 1000 ; 
diff --git a/win/jpeg62.def b/win/jpeg62.def
old mode 100755
new mode 100644
index 3c33fbf2..0101d758
--- a/win/jpeg62.def
+++ b/win/jpeg62.def
@@ -100,3 +100,13 @@ EXPORTS
 	jpeg_write_tables @ 99 ; 
 	jround_up @ 100 ; 
 	jzero_far @ 101 ; 
+	jpeg_c_bool_param_supported @ 200 ; 
+	jpeg_c_set_bool_param @ 201 ; 
+	jpeg_c_get_bool_param @ 202 ; 
+	jpeg_c_float_param_supported @ 203 ; 
+	jpeg_c_set_float_param @ 204 ; 
+	jpeg_c_get_float_param @ 205 ; 
+	jpeg_c_int_param_supported @ 206 ; 
+	jpeg_c_set_int_param @ 207 ; 
+	jpeg_c_get_int_param @ 208 ; 
+	jpeg_float_quality_scaling @ 1000 ; 
diff --git a/win/jpeg7-memsrcdst.def b/win/jpeg7-memsrcdst.def
index 8c9f517a..7b424a7f 100644
--- a/win/jpeg7-memsrcdst.def
+++ b/win/jpeg7-memsrcdst.def
@@ -104,3 +104,13 @@ EXPORTS
 	jzero_far @ 103 ; 
 	jpeg_mem_dest @ 104 ; 
 	jpeg_mem_src @ 105 ; 
+	jpeg_c_bool_param_supported @ 200 ; 
+	jpeg_c_set_bool_param @ 201 ; 
+	jpeg_c_get_bool_param @ 202 ; 
+	jpeg_c_float_param_supported @ 203 ; 
+	jpeg_c_set_float_param @ 204 ; 
+	jpeg_c_get_float_param @ 205 ; 
+	jpeg_c_int_param_supported @ 206 ; 
+	jpeg_c_set_int_param @ 207 ; 
+	jpeg_c_get_int_param @ 208 ; 
+	jpeg_float_quality_scaling @ 1000 ; 
diff --git a/win/jpeg7.def b/win/jpeg7.def
index 5ca227b8..82375781 100644
--- a/win/jpeg7.def
+++ b/win/jpeg7.def
@@ -102,3 +102,13 @@ EXPORTS
 	jpeg_write_tables @ 101 ; 
 	jround_up @ 102 ; 
 	jzero_far @ 103 ; 
+	jpeg_c_bool_param_supported @ 200 ; 
+	jpeg_c_set_bool_param @ 201 ; 
+	jpeg_c_get_bool_param @ 202 ; 
+	jpeg_c_float_param_supported @ 203 ; 
+	jpeg_c_set_float_param @ 204 ; 
+	jpeg_c_get_float_param @ 205 ; 
+	jpeg_c_int_param_supported @ 206 ; 
+	jpeg_c_set_int_param @ 207 ; 
+	jpeg_c_get_int_param @ 208 ; 
+	jpeg_float_quality_scaling @ 1000 ; 
diff --git a/win/jpeg8.def b/win/jpeg8.def
index 3fa6111c..ee2ccfe3 100644
--- a/win/jpeg8.def
+++ b/win/jpeg8.def
@@ -105,3 +105,13 @@ EXPORTS
 	jpeg_write_tables @ 104 ; 
 	jround_up @ 105 ; 
 	jzero_far @ 106 ; 
+	jpeg_c_bool_param_supported @ 200 ; 
+	jpeg_c_set_bool_param @ 201 ; 
+	jpeg_c_get_bool_param @ 202 ; 
+	jpeg_c_float_param_supported @ 203 ; 
+	jpeg_c_set_float_param @ 204 ; 
+	jpeg_c_get_float_param @ 205 ; 
+	jpeg_c_int_param_supported @ 206 ; 
+	jpeg_c_set_int_param @ 207 ; 
+	jpeg_c_get_int_param @ 208 ; 
+	jpeg_float_quality_scaling @ 1000 ; 
diff --git a/win/jsimdcfg.inc b/win/jsimdcfg.inc
old mode 100755
new mode 100644
diff --git a/wrbmp.c b/wrbmp.c
index 3a85441c..b7ecb491 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -5,6 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
+ * Copyright (C) 2014, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains routines to write output images in Microsoft "BMP"
@@ -20,6 +21,7 @@
  */
 
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jconfigint.h"
 
 #ifdef BMP_SUPPORTED
 
@@ -64,6 +66,15 @@ LOCAL(void) write_colormap
          int map_entry_size);
 
 
+static INLINE boolean is_big_endian(void)
+{
+  int test_value = 1;  /* probe word: inspect its lowest-addressed byte */
+  if(*(char *)&test_value != 1)  /* first byte is not 1, so not little-endian */
+    return TRUE;
+  return FALSE;
+}
+
+
 /*
  * Write some pixel data.
  * In this module rows_supplied will always be 1.
@@ -93,18 +104,18 @@ put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   outptr = image_ptr[0];
 
   if(cinfo->out_color_space == JCS_RGB565) {
-    #define red_mask    0xF800
-    #define green_mask  0x7E0
-    #define blue_mask   0x1F
-    unsigned char  r, g, b;
+    boolean big_endian = is_big_endian();
     unsigned short *inptr2 = (unsigned short *)inptr;
     for (col = cinfo->output_width; col > 0; col--) {
-      r = (*inptr2 & red_mask) >> 11;
-      g = (*inptr2 & green_mask) >> 5;
-      b = (*inptr2 & blue_mask);
-      outptr[0] = b << 3;
-      outptr[1] = g << 2;
-      outptr[2] = r << 3;
+      if (big_endian) {
+        outptr[0] = (*inptr2 >> 5) & 0xF8;
+        outptr[1] = ((*inptr2 << 5) & 0xE0) | ((*inptr2 >> 11) & 0x1C);
+        outptr[2] = *inptr2 & 0xF8;
+      } else {
+        outptr[0] = (*inptr2 << 3) & 0xF8;
+        outptr[1] = (*inptr2 >> 3) & 0xFC;
+        outptr[2] = (*inptr2 >> 8) & 0xF8;
+      }
       outptr += 3;
       inptr2++;
     }
diff --git a/yuvjpeg.c b/yuvjpeg.c
index 542ca826..0a94079c 100644
--- a/yuvjpeg.c
+++ b/yuvjpeg.c
@@ -217,7 +217,8 @@ int main(int argc, char *argv[]) {
 
   jpeg_stdio_dest(&cinfo, jpg_fd);
 
-  cinfo.use_moz_defaults = TRUE;
+  if (jpeg_c_bool_param_supported(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
+    jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
 
   cinfo.image_width = luma_width;
   cinfo.image_height = luma_height;