From 41ebea736dfcaf867e6009fbb32c371ff44409b6 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Tue, 9 Sep 2014 16:06:32 -0400
Subject: [PATCH 001/108] Make provisions for more default quant tables

---
 cjpeg.c    | 10 +++---
 jcparam.c  | 93 +++++++++++++++++++++++++++---------------------------
 jpeglib.h  |  2 +-
 rdswitch.c | 86 +++++++++++++++++++++++++++-----------------------
 4 files changed, 98 insertions(+), 93 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index a56a1ecd..94b1aecb 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -326,7 +326,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       cinfo->optimize_scans = FALSE;
 
     } else if (keymatch(arg, "flat", 4)) {
-      cinfo->use_flat_quant_tbl = TRUE;
+      cinfo->quant_tbl_master_idx = 1;
       jpeg_set_quality(cinfo, 75, TRUE);
 
     } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
@@ -507,28 +507,28 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       cinfo->trellis_quant_dc = TRUE;
       
     } else if (keymatch(arg, "tune-psnr", 6)) {
-      cinfo->use_flat_quant_tbl = TRUE;
+      cinfo->quant_tbl_master_idx = 1;
       cinfo->lambda_log_scale1 = 9.0;
       cinfo->lambda_log_scale2 = 0.0;
       cinfo->use_lambda_weight_tbl = FALSE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ssim", 6)) {
-      cinfo->use_flat_quant_tbl = TRUE;
+      cinfo->quant_tbl_master_idx = 1;
       cinfo->lambda_log_scale1 = 12.0;
       cinfo->lambda_log_scale2 = 13.5;
       cinfo->use_lambda_weight_tbl = FALSE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ms-ssim", 6)) {
-      cinfo->use_flat_quant_tbl = FALSE;
+      cinfo->quant_tbl_master_idx = 0;
       cinfo->lambda_log_scale1 = 14.25;
       cinfo->lambda_log_scale2 = 12.75;
       cinfo->use_lambda_weight_tbl = TRUE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-hvs-psnr", 6)) {
-      cinfo->use_flat_quant_tbl = FALSE;
+      cinfo->quant_tbl_master_idx = 0;
       cinfo->lambda_log_scale1 = 16.0;
       cinfo->lambda_log_scale2 = 15.5;
       cinfo->use_lambda_weight_tbl = TRUE;
diff --git a/jcparam.c b/jcparam.c
index d3b8ea15..ef8826a0 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -70,36 +70,49 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
-  16,  11,  10,  16,  24,  40,  51,  61,
-  12,  12,  14,  19,  26,  58,  60,  55,
-  14,  13,  16,  24,  40,  57,  69,  56,
-  14,  17,  22,  29,  51,  87,  80,  62,
-  18,  22,  37,  56,  68, 109, 103,  77,
-  24,  35,  55,  64,  81, 104, 113,  92,
-  49,  64,  78,  87, 103, 121, 120, 101,
-  72,  92,  95,  98, 112, 100, 103,  99
+static const unsigned int std_luminance_quant_tbl[][DCTSIZE2] = {
+  {
+    16,  11,  10,  16,  24,  40,  51,  61,
+    12,  12,  14,  19,  26,  58,  60,  55,
+    14,  13,  16,  24,  40,  57,  69,  56,
+    14,  17,  22,  29,  51,  87,  80,  62,
+    18,  22,  37,  56,  68, 109, 103,  77,
+    24,  35,  55,  64,  81, 104, 113,  92,
+    49,  64,  78,  87, 103, 121, 120, 101,
+    72,  92,  95,  98, 112, 100, 103,  99
+  },
+  {
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16
+  }
 };
-static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-  17,  18,  24,  47,  99,  99,  99,  99,
-  18,  21,  26,  66,  99,  99,  99,  99,
-  24,  26,  56,  99,  99,  99,  99,  99,
-  47,  66,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99
-};
-
-static const unsigned int flat_quant_tbl[DCTSIZE2] = {
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16
+static const unsigned int std_chrominance_quant_tbl[][DCTSIZE2] = {
+  {
+    17,  18,  24,  47,  99,  99,  99,  99,
+    18,  21,  26,  66,  99,  99,  99,  99,
+    24,  26,  56,  99,  99,  99,  99,  99,
+    47,  66,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99
+  },
+  {
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16
+  }
 };
 
 #if JPEG_LIB_VERSION >= 70
@@ -111,18 +124,11 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
  */
 {
   /* Set up two quantization tables using the specified scaling */
-  if (cinfo->use_flat_quant_tbl) {
-    jpeg_add_quant_table(cinfo, 0, flat_quant_tbl,
-                         cinfo->q_scale_factor[0], force_baseline);
-    jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
-                         cinfo->q_scale_factor[1], force_baseline);
-  } else {
-  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl[cinfo->quant_tbl_master_idx],
                        cinfo->q_scale_factor[0], force_baseline);
-  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl[cinfo->quant_tbl_master_idx],
                        cinfo->q_scale_factor[1], force_baseline);
 }
-}
 #endif
 
 
@@ -136,18 +142,11 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
  */
 {
   /* Set up two quantization tables using the specified scaling */
-  if (cinfo->use_flat_quant_tbl) {
-    jpeg_add_quant_table(cinfo, 0, flat_quant_tbl,
-                         scale_factor, force_baseline);
-    jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
-                         scale_factor, force_baseline);
-  } else {
-  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl[cinfo->quant_tbl_master_idx],
                        scale_factor, force_baseline);
-  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl[cinfo->quant_tbl_master_idx],
                        scale_factor, force_baseline);
 }
-}
 
 
 GLOBAL(int)
diff --git a/jpeglib.h b/jpeglib.h
index e5d78ebd..96df1475 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -381,7 +381,7 @@ struct jpeg_compress_struct {
   boolean trellis_quant; /* TRUE=use trellis quantization */
   boolean trellis_quant_dc; /* TRUE=use trellis quant for DC coefficient */
   boolean trellis_eob_opt; /* TRUE=optimize for sequences of EOB */
-  boolean use_flat_quant_tbl; /* TRUE=use flat quantization table */
+  int quant_tbl_master_idx; /* Quantization table master index */
   boolean use_lambda_weight_tbl; /* TRUE=use lambda weighting table */
   boolean use_scans_in_trellis; /* TRUE=use scans in trellis optimization */
   boolean trellis_passes; /* TRUE=currently doing trellis-related passes */
diff --git a/rdswitch.c b/rdswitch.c
index 75b2081c..f63d6ebe 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -279,52 +279,58 @@ bogus:
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
-  16,  11,  10,  16,  24,  40,  51,  61,
-  12,  12,  14,  19,  26,  58,  60,  55,
-  14,  13,  16,  24,  40,  57,  69,  56,
-  14,  17,  22,  29,  51,  87,  80,  62,
-  18,  22,  37,  56,  68, 109, 103,  77,
-  24,  35,  55,  64,  81, 104, 113,  92,
-  49,  64,  78,  87, 103, 121, 120, 101,
-  72,  92,  95,  98, 112, 100, 103,  99
+static const unsigned int std_luminance_quant_tbl[][DCTSIZE2] = {
+  {
+    16,  11,  10,  16,  24,  40,  51,  61,
+    12,  12,  14,  19,  26,  58,  60,  55,
+    14,  13,  16,  24,  40,  57,  69,  56,
+    14,  17,  22,  29,  51,  87,  80,  62,
+    18,  22,  37,  56,  68, 109, 103,  77,
+    24,  35,  55,  64,  81, 104, 113,  92,
+    49,  64,  78,  87, 103, 121, 120, 101,
+    72,  92,  95,  98, 112, 100, 103,  99
+  },
+  {
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16
+  }
 };
-static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-  17,  18,  24,  47,  99,  99,  99,  99,
-  18,  21,  26,  66,  99,  99,  99,  99,
-  24,  26,  56,  99,  99,  99,  99,  99,
-  47,  66,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99,
-  99,  99,  99,  99,  99,  99,  99,  99
-};
-
-static const unsigned int flat_quant_tbl[DCTSIZE2] = {
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16,
-  16,  16,  16,  16,  16,  16,  16,  16
+static const unsigned int std_chrominance_quant_tbl[][DCTSIZE2] = {
+  {
+    17,  18,  24,  47,  99,  99,  99,  99,
+    18,  21,  26,  66,  99,  99,  99,  99,
+    24,  26,  56,  99,  99,  99,  99,  99,
+    47,  66,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99
+  },
+  {
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16,
+    16,  16,  16,  16,  16,  16,  16,  16
+  }
 };
 
 LOCAL(void)
 jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
 {
-  if (cinfo->use_flat_quant_tbl) {
-    jpeg_add_quant_table(cinfo, 0, flat_quant_tbl,
-                         q_scale_factor[0], force_baseline);
-    jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
-                         q_scale_factor[1], force_baseline);
-  } else {
-    jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-                         q_scale_factor[0], force_baseline);
-    jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-                         q_scale_factor[1], force_baseline);
-  }
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl[cinfo->quant_tbl_master_idx],
+                       q_scale_factor[0], force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl[cinfo->quant_tbl_master_idx],
+                       q_scale_factor[1], force_baseline);
 }
 #endif
 

From 83d6a8e6434ab892b636b0c8f0019dee2bc2e3f9 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Tue, 30 Sep 2014 10:24:52 -0400
Subject: [PATCH 002/108] Add new quantization tables

New quantization tables tuned for PSNR-HVS are added.
Per-coefficient lambda weights are now derived based on the
quantization table entries.
---
 cjpeg.c    | 18 +++++++--------
 jcdctmgr.c | 13 +++++++++--
 jcparam.c  | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 rdswitch.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 144 insertions(+), 15 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index 94b1aecb..c6ec4486 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -508,29 +508,29 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       
     } else if (keymatch(arg, "tune-psnr", 6)) {
       cinfo->quant_tbl_master_idx = 1;
-      cinfo->lambda_log_scale1 = 9.0;
+      cinfo->lambda_log_scale1 = 9.25;
       cinfo->lambda_log_scale2 = 0.0;
       cinfo->use_lambda_weight_tbl = FALSE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ssim", 6)) {
       cinfo->quant_tbl_master_idx = 1;
-      cinfo->lambda_log_scale1 = 12.0;
-      cinfo->lambda_log_scale2 = 13.5;
+      cinfo->lambda_log_scale1 = 11.25;
+      cinfo->lambda_log_scale2 = 12.75;
       cinfo->use_lambda_weight_tbl = FALSE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ms-ssim", 6)) {
-      cinfo->quant_tbl_master_idx = 0;
-      cinfo->lambda_log_scale1 = 14.25;
-      cinfo->lambda_log_scale2 = 12.75;
+      cinfo->quant_tbl_master_idx = 2;
+      cinfo->lambda_log_scale1 = 12.25;
+      cinfo->lambda_log_scale2 = 13.25;
       cinfo->use_lambda_weight_tbl = TRUE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-hvs-psnr", 6)) {
-      cinfo->quant_tbl_master_idx = 0;
-      cinfo->lambda_log_scale1 = 16.0;
-      cinfo->lambda_log_scale2 = 15.5;
+      cinfo->quant_tbl_master_idx = 4;
+      cinfo->lambda_log_scale1 = 13.75;
+      cinfo->lambda_log_scale2 = 15.25;
       cinfo->use_lambda_weight_tbl = TRUE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 6cab5ab5..249fd83e 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -704,6 +704,8 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
   float *accumulated_dc_cost[3];
   int *dc_cost_backtrack[3];
   JCOEF *dc_candidate[3];
+  int mode = 1;
+  float lambda_table[DCTSIZE2];
 
   Ss = cinfo->Ss;
   Se = cinfo->Se;
@@ -739,13 +741,20 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
       }
     }
   }
+  
   norm = 0.0;
   for (i = 1; i < DCTSIZE2; i++) {
     norm += qtbl->quantval[i] * qtbl->quantval[i];
   }
   norm /= 63.0;
-  
-  lambda_base = 1.0 / norm;
+
+  if (mode == 1) {
+    lambda_base = 1.0;
+    lambda_tbl = lambda_table;
+    for (i = 0; i < DCTSIZE2; i++)
+      lambda_table[i] = 1.0 / (qtbl->quantval[i] * qtbl->quantval[i]);
+  } else
+    lambda_base = 1.0 / norm;
   
   for (bi = 0; bi < num_blocks; bi++) {
     
diff --git a/jcparam.c b/jcparam.c
index ef8826a0..5cb1281d 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -70,7 +70,7 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[][DCTSIZE2] = {
+static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
   {
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
@@ -90,9 +90,39 @@ static const unsigned int std_luminance_quant_tbl[][DCTSIZE2] = {
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16
+  },
+  {
+    12, 17, 20, 21, 30, 34, 56, 63,
+    18, 20, 20, 26, 28, 51, 61, 55,
+    19, 20, 21, 26, 33, 58, 69, 55,
+    26, 26, 26, 30, 46, 87, 86, 66,
+    31, 33, 36, 40, 46, 96, 100, 73,
+    40, 35, 46, 62, 81, 100, 111, 91,
+    46, 66, 76, 86, 102, 121, 120, 101,
+    68, 90, 90, 96, 113, 102, 105, 103
+  },
+  {
+    11, 15, 17, 19, 29, 40, 55, 62,
+    16, 16, 18, 24, 33, 59, 60, 56,
+    16, 19, 22, 29, 43, 59, 69, 58,
+    22, 22, 30, 43, 59, 89, 81, 65,
+    27, 35, 47, 75, 69, 115, 105, 78,
+    49, 45, 59, 70, 85, 106, 113, 93,
+    52, 67, 78, 89, 103, 123, 120, 102,
+    71, 92, 96, 98, 112, 100, 103, 99
+  },
+  {
+    9, 10, 12, 14, 27, 32, 51, 62,
+    11, 12, 14, 19, 27, 44, 59, 73,
+    12, 14, 18, 25, 42, 59, 79, 78,
+    17, 18, 25, 42, 61, 92, 87, 92,
+    23, 28, 42, 75, 79, 112, 112, 99,
+    40, 42, 59, 84, 88, 124, 132, 111,
+    42, 64, 78, 95, 105, 126, 125, 99,
+    70, 75, 100, 102, 116, 100, 107, 98
   }
 };
-static const unsigned int std_chrominance_quant_tbl[][DCTSIZE2] = {
+static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
   {
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
@@ -112,6 +142,36 @@ static const unsigned int std_chrominance_quant_tbl[][DCTSIZE2] = {
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16
+  },
+  {
+    8, 12, 15, 15, 86, 96, 96, 98,
+    13, 13, 15, 26, 90, 96, 99, 98,
+    12, 15, 18, 96, 99, 99, 99, 99,
+    17, 16, 90, 96, 99, 99, 99, 99,
+    96, 96, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+    8, 14, 17, 21, 90, 98, 98, 99,
+    14, 15, 17, 46, 92, 93, 90, 97,
+    16, 21, 44, 94, 96, 96, 98, 98,
+    27, 31, 84, 85, 96, 96, 98, 98,
+    31, 92, 96, 94, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+    9, 10, 17, 19, 62, 89, 91, 97,
+    12, 13, 18, 29, 84, 91, 88, 98,
+    14, 19, 29, 93, 95, 95, 98, 97,
+    20, 26, 84, 88, 95, 95, 98, 94,
+    26, 86, 91, 93, 97, 99, 98, 99,
+    99, 100, 98, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    97, 97, 99, 99, 99, 99, 97, 99
   }
 };
 
diff --git a/rdswitch.c b/rdswitch.c
index f63d6ebe..02b5af80 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -279,7 +279,7 @@ bogus:
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[][DCTSIZE2] = {
+static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
   {
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
@@ -299,9 +299,39 @@ static const unsigned int std_luminance_quant_tbl[][DCTSIZE2] = {
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16
+  },
+  {
+    12, 17, 20, 21, 30, 34, 56, 63,
+    18, 20, 20, 26, 28, 51, 61, 55,
+    19, 20, 21, 26, 33, 58, 69, 55,
+    26, 26, 26, 30, 46, 87, 86, 66,
+    31, 33, 36, 40, 46, 96, 100, 73,
+    40, 35, 46, 62, 81, 100, 111, 91,
+    46, 66, 76, 86, 102, 121, 120, 101,
+    68, 90, 90, 96, 113, 102, 105, 103
+  },
+  {
+    11, 15, 17, 19, 29, 40, 55, 62,
+    16, 16, 18, 24, 33, 59, 60, 56,
+    16, 19, 22, 29, 43, 59, 69, 58,
+    22, 22, 30, 43, 59, 89, 81, 65,
+    27, 35, 47, 75, 69, 115, 105, 78,
+    49, 45, 59, 70, 85, 106, 113, 93,
+    52, 67, 78, 89, 103, 123, 120, 102,
+    71, 92, 96, 98, 112, 100, 103, 99
+  },
+  {
+    9, 10, 12, 14, 27, 32, 51, 62,
+    11, 12, 14, 19, 27, 44, 59, 73,
+    12, 14, 18, 25, 42, 59, 79, 78,
+    17, 18, 25, 42, 61, 92, 87, 92,
+    23, 28, 42, 75, 79, 112, 112, 99,
+    40, 42, 59, 84, 88, 124, 132, 111,
+    42, 64, 78, 95, 105, 126, 125, 99,
+    70, 75, 100, 102, 116, 100, 107, 98
   }
 };
-static const unsigned int std_chrominance_quant_tbl[][DCTSIZE2] = {
+static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
   {
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
@@ -321,6 +351,36 @@ static const unsigned int std_chrominance_quant_tbl[][DCTSIZE2] = {
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16
+  },
+  {
+    8, 12, 15, 15, 86, 96, 96, 98,
+    13, 13, 15, 26, 90, 96, 99, 98,
+    12, 15, 18, 96, 99, 99, 99, 99,
+    17, 16, 90, 96, 99, 99, 99, 99,
+    96, 96, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+    8, 14, 17, 21, 90, 98, 98, 99,
+    14, 15, 17, 46, 92, 93, 90, 97,
+    16, 21, 44, 94, 96, 96, 98, 98,
+    27, 31, 84, 85, 96, 96, 98, 98,
+    31, 92, 96, 94, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+    9, 10, 17, 19, 62, 89, 91, 97,
+    12, 13, 18, 29, 84, 91, 88, 98,
+    14, 19, 29, 93, 95, 95, 98, 97,
+    20, 26, 84, 88, 95, 95, 98, 94,
+    26, 86, 91, 93, 97, 99, 98, 99,
+    99, 100, 98, 99, 99, 99, 99, 99,
+    99, 99, 99, 99, 99, 99, 99, 99,
+    97, 97, 99, 99, 99, 99, 97, 99
   }
 };
 

From 02fc068dc80d8353cbc735d82906694330d1591e Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Wed, 29 Oct 2014 16:10:48 +0100
Subject: [PATCH 003/108] Add quantization tables

Add various quantization tables suggested in #84.
The chroma tables still need to be done.
---
 jcparam.c  | 100 +++++++++++++++++++++++++++++++++++++++++++----------
 rdswitch.c |  95 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 159 insertions(+), 36 deletions(-)

diff --git a/jcparam.c b/jcparam.c
index 5cb1281d..2cd94542 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -70,8 +70,9 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
+static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
   {
+    // JPEG Annex K
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
     14,  13,  16,  24,  40,  57,  69,  56,
@@ -82,6 +83,7 @@ static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
     72,  92,  95,  98, 112, 100, 103,  99
   },
   {
+    // Flat
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
@@ -102,14 +104,15 @@ static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
     68, 90, 90, 96, 113, 102, 105, 103
   },
   {
-    11, 15, 17, 19, 29, 40, 55, 62,
-    16, 16, 18, 24, 33, 59, 60, 56,
-    16, 19, 22, 29, 43, 59, 69, 58,
-    22, 22, 30, 43, 59, 89, 81, 65,
-    27, 35, 47, 75, 69, 115, 105, 78,
-    49, 45, 59, 70, 85, 106, 113, 93,
-    52, 67, 78, 89, 103, 123, 120, 102,
-    71, 92, 96, 98, 112, 100, 103, 99
+    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    16,  16,  16,  18,  25,  37,  56,  85,
+    16,  17,  20,  27,  34,  40,  53,  75,
+    16,  20,  24,  31,  43,  62,  91,  135,
+    18,  27,  31,  40,  53,  74,  106, 156,
+    25,  34,  43,  53,  69,  94,  131, 189,
+    37,  40,  62,  74,  94,  124, 169, 238,
+    56,  53,  91,  106, 131, 169, 226, 311,
+    85,  75,  135, 156, 189, 238, 311, 418
   },
   {
     9, 10, 12, 14, 27, 32, 51, 62,
@@ -120,10 +123,58 @@ static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
     40, 42, 59, 84, 88, 124, 132, 111,
     42, 64, 78, 95, 105, 126, 125, 99,
     70, 75, 100, 102, 116, 100, 107, 98
+  },
+  {
+#if 0
+    // DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+    7, 8, 10, 14, 23, 44, 95, 241,
+    8, 8, 11, 15, 25, 47, 102, 255,
+    10, 11, 13, 19, 31, 58, 127, 255,
+    14, 15, 19, 27, 44, 83, 181, 255,
+    23, 25, 31, 44, 72, 136, 255, 255,
+    44, 47, 58, 83, 136, 255, 255, 255,
+    95, 102, 127, 181, 255, 255, 255, 255,
+    241, 255, 255, 255, 255, 255, 255, 255
+#endif
+#if 0
+    // A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+    15, 11, 11, 12, 15, 19, 25, 32,
+    11, 13, 10, 10, 12, 15, 19, 24,
+    11, 10, 14, 14, 16, 18, 22, 27,
+    12, 10, 14, 18, 21, 24, 28, 33,
+    15, 12, 16, 21, 26, 31, 36, 42,
+    19, 15, 18, 24, 31, 38, 45, 53,
+    25, 19, 22, 28, 36, 45, 55, 65,
+    32, 24, 27, 33, 42, 53, 65, 77
+#endif
+#if 1
+    // Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+    10, 12, 14, 19, 26, 38, 57, 86,
+    12, 18, 21, 28, 35, 41, 54, 76,
+    14, 21, 25, 32, 44, 63, 92, 136,
+    19, 28, 32, 41, 54, 75, 107, 157,
+    26, 35, 44, 54, 70, 95, 132, 190,
+    38, 41, 63, 75, 95, 125, 170, 239,
+    57, 54, 92, 107, 132, 170, 227, 312,
+    86, 76, 136, 157, 190, 239, 312, 419
+#endif
+#if 0
+    // An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+    14, 10, 11, 14, 19, 25, 34, 45,
+    10, 11, 11, 12, 15, 20, 26, 33,
+    11, 11, 15, 18, 21, 25, 31, 38,
+    14, 12, 18, 24, 28, 33, 39, 47,
+    19, 15, 21, 28, 36, 43, 51, 59,
+    25, 20, 25, 33, 43, 54, 64, 74,
+    34, 26, 31, 39, 51, 64, 77, 91,
+    45, 33, 38, 47, 59, 74, 91, 108
+#endif
   }
 };
-static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
+
+static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
   {
+    // JPEG Annex K
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
     24,  26,  56,  99,  99,  99,  99,  99,
@@ -134,6 +185,7 @@ static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
     99,  99,  99,  99,  99,  99,  99,  99
   },
   {
+    // Flat
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
@@ -154,14 +206,15 @@ static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
     99, 99, 99, 99, 99, 99, 99, 99
   },
   {
-    8, 14, 17, 21, 90, 98, 98, 99,
-    14, 15, 17, 46, 92, 93, 90, 97,
-    16, 21, 44, 94, 96, 96, 98, 98,
-    27, 31, 84, 85, 96, 96, 98, 98,
-    31, 92, 96, 94, 99, 99, 99, 99,
-    99, 99, 99, 99, 99, 99, 99, 99,
-    99, 99, 99, 99, 99, 99, 99, 99,
-    99, 99, 99, 99, 99, 99, 99, 99
+    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    16,  16,  16,  18,  25,  37,  56,  85,
+    16,  17,  20,  27,  34,  40,  53,  75,
+    16,  20,  24,  31,  43,  62,  91,  135,
+    18,  27,  31,  40,  53,  74,  106, 156,
+    25,  34,  43,  53,  69,  94,  131, 189,
+    37,  40,  62,  74,  94,  124, 169, 238,
+    56,  53,  91,  106, 131, 169, 226, 311,
+    85,  75,  135, 156, 189, 238, 311, 418
   },
   {
     9, 10, 17, 19, 62, 89, 91, 97,
@@ -172,6 +225,17 @@ static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
     99, 100, 98, 99, 99, 99, 99, 99,
     99, 99, 99, 99, 99, 99, 99, 99,
     97, 97, 99, 99, 99, 99, 97, 99
+  },
+  {
+    // JPEG Annex K
+    17,  18,  24,  47,  99,  99,  99,  99,
+    18,  21,  26,  66,  99,  99,  99,  99,
+    24,  26,  56,  99,  99,  99,  99,  99,
+    47,  66,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99
   }
 };
 
diff --git a/rdswitch.c b/rdswitch.c
index 02b5af80..7fac1b0d 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -279,7 +279,7 @@ bogus:
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
+static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
   {
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
@@ -311,14 +311,15 @@ static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
     68, 90, 90, 96, 113, 102, 105, 103
   },
   {
-    11, 15, 17, 19, 29, 40, 55, 62,
-    16, 16, 18, 24, 33, 59, 60, 56,
-    16, 19, 22, 29, 43, 59, 69, 58,
-    22, 22, 30, 43, 59, 89, 81, 65,
-    27, 35, 47, 75, 69, 115, 105, 78,
-    49, 45, 59, 70, 85, 106, 113, 93,
-    52, 67, 78, 89, 103, 123, 120, 102,
-    71, 92, 96, 98, 112, 100, 103, 99
+    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    16,  16,  16,  18,  25,  37,  56,  85,
+    16,  17,  20,  27,  34,  40,  53,  75,
+    16,  20,  24,  31,  43,  62,  91,  135,
+    18,  27,  31,  40,  53,  74,  106, 156,
+    25,  34,  43,  53,  69,  94,  131, 189,
+    37,  40,  62,  74,  94,  124, 169, 238,
+    56,  53,  91,  106, 131, 169, 226, 311,
+    85,  75,  135, 156, 189, 238, 311, 418
   },
   {
     9, 10, 12, 14, 27, 32, 51, 62,
@@ -329,9 +330,56 @@ static const unsigned int std_luminance_quant_tbl[5][DCTSIZE2] = {
     40, 42, 59, 84, 88, 124, 132, 111,
     42, 64, 78, 95, 105, 126, 125, 99,
     70, 75, 100, 102, 116, 100, 107, 98
+  },
+  {
+#if 0
+    // DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+    7, 8, 10, 14, 23, 44, 95, 241,
+    8, 8, 11, 15, 25, 47, 102, 255,
+    10, 11, 13, 19, 31, 58, 127, 255,
+    14, 15, 19, 27, 44, 83, 181, 255,
+    23, 25, 31, 44, 72, 136, 255, 255,
+    44, 47, 58, 83, 136, 255, 255, 255,
+    95, 102, 127, 181, 255, 255, 255, 255,
+    241, 255, 255, 255, 255, 255, 255, 255
+#endif
+#if 0
+    // A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+    15, 11, 11, 12, 15, 19, 25, 32,
+    11, 13, 10, 10, 12, 15, 19, 24,
+    11, 10, 14, 14, 16, 18, 22, 27,
+    12, 10, 14, 18, 21, 24, 28, 33,
+    15, 12, 16, 21, 26, 31, 36, 42,
+    19, 15, 18, 24, 31, 38, 45, 53,
+    25, 19, 22, 28, 36, 45, 55, 65,
+    32, 24, 27, 33, 42, 53, 65, 77
+#endif
+#if 1
+    // Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+    10, 12, 14, 19, 26, 38, 57, 86,
+    12, 18, 21, 28, 35, 41, 54, 76,
+    14, 21, 25, 32, 44, 63, 92, 136,
+    19, 28, 32, 41, 54, 75, 107, 157,
+    26, 35, 44, 54, 70, 95, 132, 190,
+    38, 41, 63, 75, 95, 125, 170, 239,
+    57, 54, 92, 107, 132, 170, 227, 312,
+    86, 76, 136, 157, 190, 239, 312, 419
+#endif
+#if 0
+    // An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+    14, 10, 11, 14, 19, 25, 34, 45,
+    10, 11, 11, 12, 15, 20, 26, 33,
+    11, 11, 15, 18, 21, 25, 31, 38,
+    14, 12, 18, 24, 28, 33, 39, 47,
+    19, 15, 21, 28, 36, 43, 51, 59,
+    25, 20, 25, 33, 43, 54, 64, 74,
+    34, 26, 31, 39, 51, 64, 77, 91,
+    45, 33, 38, 47, 59, 74, 91, 108
+#endif
   }
 };
-static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
+
+static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
   {
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
@@ -363,14 +411,15 @@ static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
     99, 99, 99, 99, 99, 99, 99, 99
   },
   {
-    8, 14, 17, 21, 90, 98, 98, 99,
-    14, 15, 17, 46, 92, 93, 90, 97,
-    16, 21, 44, 94, 96, 96, 98, 98,
-    27, 31, 84, 85, 96, 96, 98, 98,
-    31, 92, 96, 94, 99, 99, 99, 99,
-    99, 99, 99, 99, 99, 99, 99, 99,
-    99, 99, 99, 99, 99, 99, 99, 99,
-    99, 99, 99, 99, 99, 99, 99, 99
+    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    16,  16,  16,  18,  25,  37,  56,  85,
+    16,  17,  20,  27,  34,  40,  53,  75,
+    16,  20,  24,  31,  43,  62,  91,  135,
+    18,  27,  31,  40,  53,  74,  106, 156,
+    25,  34,  43,  53,  69,  94,  131, 189,
+    37,  40,  62,  74,  94,  124, 169, 238,
+    56,  53,  91,  106, 131, 169, 226, 311,
+    85,  75,  135, 156, 189, 238, 311, 418
   },
   {
     9, 10, 17, 19, 62, 89, 91, 97,
@@ -381,6 +430,16 @@ static const unsigned int std_chrominance_quant_tbl[5][DCTSIZE2] = {
     99, 100, 98, 99, 99, 99, 99, 99,
     99, 99, 99, 99, 99, 99, 99, 99,
     97, 97, 99, 99, 99, 99, 97, 99
+  },
+  {
+    17,  18,  24,  47,  99,  99,  99,  99,
+    18,  21,  26,  66,  99,  99,  99,  99,
+    24,  26,  56,  99,  99,  99,  99,  99,
+    47,  66,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99    
   }
 };
 

From 8a12b6a6a0d3846d2b9f4ba67cfbd3a7fa3d71d3 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Thu, 30 Oct 2014 14:52:41 +0100
Subject: [PATCH 004/108] Adjust lambda parameters

Parameters were retuned for grayscale and quality range 0-100
---
 cjpeg.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index c6ec4486..a205c57d 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -508,29 +508,29 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       
     } else if (keymatch(arg, "tune-psnr", 6)) {
       cinfo->quant_tbl_master_idx = 1;
-      cinfo->lambda_log_scale1 = 9.25;
+      cinfo->lambda_log_scale1 = 9.0;
       cinfo->lambda_log_scale2 = 0.0;
       cinfo->use_lambda_weight_tbl = FALSE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ssim", 6)) {
       cinfo->quant_tbl_master_idx = 1;
-      cinfo->lambda_log_scale1 = 11.25;
+      cinfo->lambda_log_scale1 = 11.5;
       cinfo->lambda_log_scale2 = 12.75;
       cinfo->use_lambda_weight_tbl = FALSE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ms-ssim", 6)) {
       cinfo->quant_tbl_master_idx = 2;
-      cinfo->lambda_log_scale1 = 12.25;
-      cinfo->lambda_log_scale2 = 13.25;
+      cinfo->lambda_log_scale1 = 12.0;
+      cinfo->lambda_log_scale2 = 13.0;
       cinfo->use_lambda_weight_tbl = TRUE;
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-hvs-psnr", 6)) {
       cinfo->quant_tbl_master_idx = 4;
-      cinfo->lambda_log_scale1 = 13.75;
-      cinfo->lambda_log_scale2 = 15.25;
+      cinfo->lambda_log_scale1 = 14.75;
+      cinfo->lambda_log_scale2 = 16.5;
       cinfo->use_lambda_weight_tbl = TRUE;
       jpeg_set_quality(cinfo, 75, TRUE);
       

From 7b008bdf12e5fc12400cecb6cbf14b0ac19d4cc5 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Thu, 30 Oct 2014 15:13:13 +0100
Subject: [PATCH 005/108] Add switch to cjpeg to select quant table

Replace -flat command line argument with more generic -quant_table
argument that selects one of several predefined quantization tables
---
 cjpeg.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index a205c57d..09c3fe6f 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -203,6 +203,13 @@ usage (void)
   fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
           (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
 #endif
+  fprintf(stderr, "  -quant-table N Use predefined quantization table N:\n");
+  fprintf(stderr, "                 - 0 JPEG Annex K\n");
+  fprintf(stderr, "                 - 1 Flat\n");
+  fprintf(stderr, "                 - 2 Custom, tuned for MS-SSIM\n");
+  fprintf(stderr, "                 - 3 ImageMagick table by N. Robidoux\n");
+  fprintf(stderr, "                 - 4 Custom, tuned for PSNR-HVS\n");
+  fprintf(stderr, "                 - 5 Table from paper by Klein, Silverstein and Carney\n");
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
 #ifdef INPUT_SMOOTHING_SUPPORTED
   fprintf(stderr, "  -smooth N      Smooth dithered input (N=1..100 is strength)\n");
@@ -325,10 +332,6 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
     } else if (keymatch(arg, "fastcrush", 4)) {
       cinfo->optimize_scans = FALSE;
 
-    } else if (keymatch(arg, "flat", 4)) {
-      cinfo->quant_tbl_master_idx = 1;
-      jpeg_set_quality(cinfo, 75, TRUE);
-
     } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
       /* Force a monochrome JPEG file to be generated. */
       jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
@@ -427,6 +430,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
+    } else if (keymatch(arg, "quant_table", 2)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      cinfo->quant_tbl_master_idx = atoi(argv[argn]);
+      jpeg_set_quality(cinfo, 75, TRUE);
+      
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;

From 4618c247dfe29aa9524d829d7a211e9baab65180 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Wed, 5 Nov 2014 20:10:37 -0600
Subject: [PATCH 006/108] Various wordsmithing and cosmetic changes to remove
 libjpeg-turbo-specific information This patch does the following: --
 Implements some (hopefully non-controversial) changes to the package
 descriptions, in order to prevent confusion (the existing descriptions from
 libjpeg-turbo are not appropriate for mozjpeg.) -- Replaces "libmozjpeg" with
 "mozjpeg" in all documentation and comments.  The project is called
 "mozjpeg", and it doesn't actually generate a library called "libmozjpeg", so
 it doesn't make sense to use "libmozjpeg" to describe it. -- Replaces
 "MozJPEG" with "TurboJPEG" in all documentation and comments. "MozJPEG"
 appears to have been the product of blindly searching/replacing instances of
 "Turbo".  TurboJPEG is the name of the API, and that name still applies to
 the implementation in mozjpeg.  Furthermore, the TurboJPEG libraries are
 still called "libturbojpeg" in mozjpeg. -- Attempts to remove build
 instructions that are irrelevant or not applicable to mozjpeg.  Further work
 possibly needs to be done here; for instance, it doesn't make much sense to
 have build instructions for mobile devices when the library is not intended
 to be used for decoding. -- Changes the vendor in the DEB and RPM files from
 "The libmozjpeg Project" to "Mozilla Research". -- Changes the source tarball
 location in the RPM spec file to correctly point to the release tarball on
 github. -- Changes the source directory in the RPM spec file to
 "mozjpeg-%{version}", which is the actual name of the source directory in the
 mozjpeg tarballs.

---
 BUILDING.txt               | 219 +++++++++++++++++--------------------
 CMakeLists.txt             |   6 +-
 LICENSE.txt                |   2 +-
 configure.ac               |  14 +--
 release/License.rtf        |   2 +-
 release/ReadMe.txt         |   6 +-
 release/Welcome.rtf        |   2 +-
 release/deb-control.tmpl   |  37 +++----
 release/libmozjpeg.spec.in |  45 ++++----
 release/uninstall.in       |   2 +-
 10 files changed, 149 insertions(+), 186 deletions(-)

diff --git a/BUILDING.txt b/BUILDING.txt
index 70d71240..6f43c792 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -35,21 +35,9 @@ Build Requirements
 
    NOTE: the NASM build will fail if texinfo is not installed.
 
--- GCC v4.1 or later recommended for best performance
-   * Beginning with Xcode 4, Apple stopped distributing GCC and switched to
-     the LLVM compiler.  Xcode v4.0 through v4.6 provides a GCC front end
-     called LLVM-GCC.  Unfortunately, as of this writing, neither LLVM-GCC nor
-     the LLVM (clang) compiler produces optimal performance with libmozjpeg.
-     Building libmozjpeg with LLVM-GCC v4.2 results in a 10% performance
-     degradation when compressing using 64-bit code, relative to building
-     libmozjpeg with GCC v4.2.  Building libmozjpeg with LLVM (clang)
-     results in a 20% performance degradation when compressing using 64-bit
-     code, relative to building libmozjpeg with GCC v4.2.  If you are
-     running Snow Leopard or earlier, it is suggested that you continue to use
-     Xcode v3.2.6, which provides GCC v4.2.  If you are using Lion or later, it
-     is suggested that you install Apple GCC v4.2 through MacPorts.
+-- GCC v4.1 (or later) or clang recommended
 
--- If building the MozJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
+-- If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
    required.  Some systems, such as Solaris 10 and later and Red Hat Enterprise
    Linux 5 and later, have this pre-installed.  On OS X 10.5 and later, it will
    be necessary to install the Java Developer Package, which can be downloaded
@@ -64,19 +52,19 @@ Out-of-Tree Builds
 
 Binary objects, libraries, and executables are generated in the same directory
 from which configure was executed (the "binary directory"), and this directory
-need not necessarily be the same as the libmozjpeg source directory.  You
+need not necessarily be the same as the mozjpeg source directory.  You
 can create multiple independent binary directories, in which different versions
-of libmozjpeg can be built from the same source tree using different
+of mozjpeg can be built from the same source tree using different
 compilers or settings.  In the sections below, {build_directory} refers to the
-binary directory, whereas {source_directory} refers to the libmozjpeg source
+binary directory, whereas {source_directory} refers to the mozjpeg source
 directory.  For in-tree builds, these directories are the same.
 
 
-======================
-Building libmozjpeg
-======================
+================
+Building mozjpeg
+================
 
-The following procedure will build libmozjpeg on Linux, FreeBSD, Cygwin, and
+The following procedure will build mozjpeg on Linux, FreeBSD, Cygwin, and
 Solaris/x86 systems (on Solaris, this generates a 32-bit library.  See below
 for 64-bit build instructions.)
 
@@ -86,9 +74,6 @@ for 64-bit build instructions.)
   sh {source_directory}/configure [additional configure flags]
   make
 
-NOTE: Running autoreconf in the source directory is usually only necessary if
-building libmozjpeg from the SVN repository.
-
 This will generate the following files under .libs/
 
   libjpeg.a
@@ -110,28 +95,28 @@ This will generate the following files under .libs/
   libjpeg.dll.a (Cygwin)
       Import library for the libjpeg API
 
-  libmozjpeg.a
-      Static link library for the MozJPEG API
+  libturbojpeg.a
+      Static link library for the TurboJPEG API
 
-  libmozjpeg.so.0.0.0 (Linux, Unix)
-  libmozjpeg.0.0.0.dylib (OS X)
-  cygmozjpeg-0.dll (Cygwin)
-      Shared library for the MozJPEG API
+  libturbojpeg.so.0.1.0 (Linux, Unix)
+  libturbojpeg.0.1.0.dylib (OS X)
+  cygturbojpeg-0.dll (Cygwin)
+      Shared library for the TurboJPEG API
 
-  libmozjpeg.so (Linux, Unix)
-  libmozjpeg.dylib (OS X)
-      Development symlink for the MozJPEG API
+  libturbojpeg.so (Linux, Unix)
+  libturbojpeg.dylib (OS X)
+      Development symlink for the TurboJPEG API
 
-  libmozjpeg.dll.a (Cygwin)
-      Import library for the MozJPEG API
+  libturbojpeg.dll.a (Cygwin)
+      Import library for the TurboJPEG API
 
 
 libjpeg v7 or v8 API/ABI Emulation
 ----------------------------------
 
 Add --with-jpeg7 to the configure command line to build a version of
-libmozjpeg that is API/ABI-compatible with libjpeg v7.  Add --with-jpeg8 to
-the configure command to build a version of libmozjpeg that is
+mozjpeg that is API/ABI-compatible with libjpeg v7.  Add --with-jpeg8 to
+the configure command to build a version of mozjpeg that is
 API/ABI-compatible with libjpeg v8.  See README-turbo.txt for more information
 on libjpeg v7 and v8 emulation.
 
@@ -140,7 +125,7 @@ In-Memory Source/Destination Managers
 -------------------------------------
 
 When using libjpeg v6b or v7 API/ABI emulation, add --without-mem-srcdst to the
-configure command line to build a version of libmozjpeg that lacks the
+configure command line to build a version of mozjpeg that lacks the
 jpeg_mem_src() and jpeg_mem_dest() functions.  These functions were not part of
 the original libjpeg v6b and v7 APIs, so removing them ensures strict
 conformance with those APIs.  See README-turbo.txt for more information.
@@ -150,7 +135,7 @@ Arithmetic Coding Support
 -------------------------
 
 Since the patent on arithmetic coding has expired, this functionality has been
-included in this release of libmozjpeg.  libmozjpeg's implementation is
+included in this release of mozjpeg.  mozjpeg's implementation is
 based on the implementation in libjpeg v8, but it works when emulating libjpeg
 v7 or v6b as well.  The default is to enable both arithmetic encoding and
 decoding, but those who have philosophical objections to arithmetic coding can
@@ -158,11 +143,11 @@ add --without-arith-enc or --without-arith-dec to the configure command line to
 disable encoding or decoding (respectively.)
 
 
-MozJPEG Java Wrapper
+TurboJPEG Java Wrapper
 ----------------------
 Add --with-java to the configure command line to incorporate an optional Java
-Native Interface wrapper into the MozJPEG shared library and build the Java
-front-end classes to support it.  This allows the MozJPEG shared library to
+Native Interface wrapper into the TurboJPEG shared library and build the Java
+front-end classes to support it.  This allows the TurboJPEG shared library to
 be used directly from Java applications.  See java/README for more details.
 
 You can set the JAVAC, JAR, and JAVA configure variables to specify
@@ -173,9 +158,9 @@ to specify arguments that should be passed to the C compiler when building the
 JNI wrapper.  Run 'configure --help' for more details.
 
 
-========================
-Installing libmozjpeg
-========================
+==================
+Installing mozjpeg
+==================
 
 If you intend to install these libraries and the associated header files, then
 replace 'make' in the instructions above with
@@ -195,9 +180,9 @@ files in /opt/libmozjpeg/lib32 (32-bit) or /opt/libmozjpeg/lib64
 NOTE: You can specify a prefix of /usr and a libdir of, for instance,
 /usr/lib64 to overwrite the system's version of libjpeg.  If you do this,
 however, then be sure to BACK UP YOUR SYSTEM'S INSTALLATION OF LIBJPEG before
-overwriting it.  It is recommended that you instead install libmozjpeg into
+overwriting it.  It is recommended that you instead install mozjpeg into
 a non-system directory and manipulate the LD_LIBRARY_PATH or create symlinks
-to force applications to use libmozjpeg instead of libjpeg.  See
+to force applications to use mozjpeg instead of libjpeg.  See
 README-turbo.txt for more information.
 
 
@@ -284,10 +269,10 @@ Add
 
   CC=cc
 
-to the configure command line.  libmozjpeg will automatically be built with
+to the configure command line.  mozjpeg will automatically be built with
 the maximum optimization level (-xO5) unless you override CFLAGS.
 
-To build a 64-bit version of libmozjpeg using Oracle Solaris Studio, add
+To build a 64-bit version of mozjpeg using Oracle Solaris Studio, add
 
   --host x86_64-pc-solaris CC=cc CFLAGS='-xO5 -m64' LDFLAGS=-m64
 
@@ -304,19 +289,19 @@ Use CMake (see recipes below)
 ARM Support
 ===========
 
-This release of libmozjpeg can use ARM NEON SIMD instructions to accelerate
+This release of mozjpeg can use ARM NEON SIMD instructions to accelerate
 JPEG compression/decompression by approximately 2-4x on ARMv7 and later
-platforms.  If libmozjpeg is configured on an ARM Linux platform, then the
+platforms.  If mozjpeg is configured on an ARM Linux platform, then the
 build system will automatically include the NEON SIMD routines, if they are
 supported.  Build instructions for other ARM-based platforms follow.
 
 
-Building libmozjpeg for iOS
-------------------------------
+Building mozjpeg for iOS
+------------------------
 
 iOS platforms, such as the iPhone and iPad, use ARM processors, some of which
 support NEON instructions.  Additional steps are required in order to build
-libmozjpeg for these platforms.
+mozjpeg for these platforms.
 
 Additional build requirements:
 
@@ -355,7 +340,7 @@ Set the following shell variables for simplicity:
   Xcode 5.0.x and later:
   IOS_CFLAGS="-arch armv7s"
 
-Follow the procedure under "Building libmozjpeg" above, adding
+Follow the procedure under "Building mozjpeg" above, adding
 
   --host arm-apple-darwin10 --enable-static --disable-shared \
     CC="$IOS_GCC" LD="$IOS_GCC" \
@@ -375,12 +360,6 @@ the SDK.
 Once built, lipo can be used to combine the ARM v6, v7, and/or v7s variants
 into a universal library.
 
-NOTE: If you are building libmozjpeg from the "official" project tarball,
-then it is highly likely that you will need to run 'autoreconf -fiv' in the
-source tree prior to building ARM v7 or v7s iOS binaries using the techniques
-described above.  Otherwise, you may get a libtool error such as "unable to
-infer tagged configuration."
-
 
 Building libjpeg-turbo for Android
 ----------------------------------
@@ -440,12 +419,12 @@ Build Requirements
    http://msdn.microsoft.com/en-us/windows/bb980924.aspx
 
    The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
-   everything necessary to build libmozjpeg.
+   everything necessary to build mozjpeg.
 
    * You can also use Microsoft Visual Studio Express Edition, which is a free
      download.  (NOTE: versions prior to 2012 can only be used to build 32-bit
      code.)
-   * If you intend to build libmozjpeg from the command line, then add the
+   * If you intend to build mozjpeg from the command line, then add the
      appropriate compiler and SDK directories to the INCLUDE, LIB, and PATH
      environment variables.  This is generally accomplished by executing
      vcvars32.bat or vcvars64.bat and SetEnv.cmd.  vcvars32.bat and
@@ -467,7 +446,7 @@ Build Requirements
 -- NASM (http://www.nasm.us/) 0.98 or later (NASM 2.05 or later is required for
    a 64-bit build)
 
--- If building the MozJPEG Java wrapper, JDK 1.5 or later is required.  This
+-- If building the TurboJPEG Java wrapper, JDK 1.5 or later is required.  This
    can be downloaded from http://www.java.com.
 
 
@@ -477,17 +456,17 @@ Out-of-Tree Builds
 
 Binary objects, libraries, and executables are generated in the same directory
 from which cmake was executed (the "binary directory"), and this directory need
-not necessarily be the same as the libmozjpeg source directory.  You can
+not necessarily be the same as the mozjpeg source directory.  You can
 create multiple independent binary directories, in which different versions of
-libmozjpeg can be built from the same source tree using different compilers
+mozjpeg can be built from the same source tree using different compilers
 or settings.  In the sections below, {build_directory} refers to the binary
-directory, whereas {source_directory} refers to the libmozjpeg source
+directory, whereas {source_directory} refers to the mozjpeg source
 directory.  For in-tree builds, these directories are the same.
 
 
-======================
-Building libmozjpeg
-======================
+================
+Building mozjpeg
+================
 
 
 Visual C++ (Command Line)
@@ -497,7 +476,7 @@ Visual C++ (Command Line)
   cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release {source_directory}
   nmake
 
-This will build either a 32-bit or a 64-bit version of libmozjpeg, depending
+This will build either a 32-bit or a 64-bit version of mozjpeg, depending
 on which version of cl.exe is in the PATH.
 
 The following files will be generated under {build_directory}:
@@ -508,12 +487,12 @@ The following files will be generated under {build_directory}:
       DLL for the libjpeg API
   sharedlib/jpeg.lib
       Import library for the libjpeg API
-  mozjpeg-static.lib
-      Static link library for the MozJPEG API
-  mozjpeg.dll
-      DLL for the MozJPEG API
-  mozjpeg.lib
-      Import library for the MozJPEG API
+  turbojpeg-static.lib
+      Static link library for the TurboJPEG API
+  turbojpeg.dll
+      DLL for the TurboJPEG API
+  turbojpeg.lib
+      Import library for the TurboJPEG API
 
 {version} is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
 v8 emulation is enabled.
@@ -536,7 +515,7 @@ and 64-bit builds.
 
 You can then open ALL_BUILD.vcproj in Visual Studio and build one of the
 configurations in that project ("Debug", "Release", etc.) to generate a full
-build of libmozjpeg.
+build of mozjpeg.
 
 This will generate the following files under {build_directory}:
 
@@ -546,12 +525,12 @@ This will generate the following files under {build_directory}:
       DLL for the libjpeg API
   sharedlib/{configuration}/jpeg.lib
       Import library for the libjpeg API
-  {configuration}/mozjpeg-static.lib
-      Static link library for the MozJPEG API
-  {configuration}/mozjpeg.dll
-      DLL for the MozJPEG API
-  {configuration}/mozjpeg.lib
-      Import library for the MozJPEG API
+  {configuration}/turbojpeg-static.lib
+      Static link library for the TurboJPEG API
+  {configuration}/turbojpeg.dll
+      DLL for the TurboJPEG API
+  {configuration}/turbojpeg.lib
+      Import library for the TurboJPEG API
 
 {configuration} is Debug, Release, RelWithDebInfo, or MinSizeRel, depending on
 the configuration you built in the IDE, and {version} is 62, 7, or 8,
@@ -576,12 +555,12 @@ This will generate the following files under {build_directory}
       DLL for the libjpeg API
   sharedlib/libjpeg.dll.a
       Import library for the libjpeg API
-  libmozjpeg.a
-      Static link library for the MozJPEG API
-  libmozjpeg.dll
-      DLL for the MozJPEG API
-  libmozjpeg.dll.a
-      Import library for the MozJPEG API
+  libturbojpeg.a
+      Static link library for the TurboJPEG API
+  libturbojpeg.dll
+      DLL for the TurboJPEG API
+  libturbojpeg.dll.a
+      Import library for the TurboJPEG API
 
 {version} is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
 v8 emulation is enabled.
@@ -599,8 +578,8 @@ libjpeg v7 or v8 API/ABI Emulation
 -----------------------------------
 
 Add "-DWITH_JPEG7=1" to the cmake command line to build a version of
-libmozjpeg that is API/ABI-compatible with libjpeg v7.  Add "-DWITH_JPEG8=1"
-to the cmake command to build a version of libmozjpeg that is
+mozjpeg that is API/ABI-compatible with libjpeg v7.  Add "-DWITH_JPEG8=1"
+to the cmake command to build a version of mozjpeg that is
 API/ABI-compatible with libjpeg v8.  See README-turbo.txt for more information
 on libjpeg v7 and v8 emulation.
 
@@ -609,7 +588,7 @@ In-Memory Source/Destination Managers
 -------------------------------------
 
 When using libjpeg v6b or v7 API/ABI emulation, add -DWITH_MEM_SRCDST=0 to the
-CMake command line to build a version of libmozjpeg that lacks the
+CMake command line to build a version of mozjpeg that lacks the
 jpeg_mem_src() and jpeg_mem_dest() functions.  These functions were not part of
 the original libjpeg v6b and v7 APIs, so removing them ensures strict
 conformance with those APIs.  See README-turbo.txt for more information.
@@ -619,7 +598,7 @@ Arithmetic Coding Support
 -------------------------
 
 Since the patent on arithmetic coding has expired, this functionality has been
-included in this release of libmozjpeg.  libmozjpeg's implementation is
+included in this release of mozjpeg.  mozjpeg's implementation is
 based on the implementation in libjpeg v8, but it works when emulating libjpeg
 v7 or v6b as well.  The default is to enable both arithmetic encoding and
 decoding, but those who have philosophical objections to arithmetic coding can
@@ -627,11 +606,11 @@ add "-DWITH_ARITH_ENC=0" or "-DWITH_ARITH_DEC=0" to the cmake command line to
 disable encoding or decoding (respectively.)
 
 
-MozJPEG Java Wrapper
+TurboJPEG Java Wrapper
 ----------------------
 Add "-DWITH_JAVA=1" to the cmake command line to incorporate an optional Java
-Native Interface wrapper into the MozJPEG shared library and build the Java
-front-end classes to support it.  This allows the MozJPEG shared library to
+Native Interface wrapper into the TurboJPEG shared library and build the Java
+front-end classes to support it.  This allows the TurboJPEG shared library to
 be used directly from Java applications.  See java/README for more details.
 
 If you are using CMake 2.8, you can set the Java_JAVAC_EXECUTABLE,
@@ -642,11 +621,11 @@ instead.  You can also set the JAVACFLAGS CMake variable to specify arguments
 that should be passed to the Java compiler when building the front-end classes.
 
 
-========================
-Installing libmozjpeg
-========================
+==================
+Installing mozjpeg
+==================
 
-You can use the build system to install libmozjpeg into a directory of your
+You can use the build system to install mozjpeg into a directory of your
 choosing (as opposed to creating an installer.)  To do this, add:
 
   -DCMAKE_INSTALL_PREFIX={install_directory}
@@ -679,7 +658,7 @@ Build Recipes
     {source_directory}
   make
 
-This produces a 64-bit build of libmozjpeg that does not depend on
+This produces a 64-bit build of mozjpeg that does not depend on
 cygwin1.dll or other Cygwin DLL's.  The mingw64-x86_64-gcc-core and
 mingw64-x86_64-gcc-g++ packages (and their dependencies) must be installed.
 
@@ -694,7 +673,7 @@ mingw64-x86_64-gcc-g++ packages (and their dependencies) must be installed.
     {source_directory}
   make
 
-This produces a 32-bit build of libmozjpeg that does not depend on
+This produces a 32-bit build of mozjpeg that does not depend on
 cygwin1.dll or other Cygwin DLL's.  The mingw64-i686-gcc-core and
 mingw64-i686-gcc-g++ packages (and their dependencies) must be installed.
 
@@ -746,7 +725,7 @@ make udmg [BUILDDIR32={32-bit build directory}]
 
   On 64-bit OS X systems, this creates a Macintosh package and disk image that
   contains universal i386/x86-64 binaries.  You should first configure a 32-bit
-  out-of-tree build of libmozjpeg, then configure a 64-bit out-of-tree
+  out-of-tree build of mozjpeg, then configure a 64-bit out-of-tree
   build, then run 'make udmg' from the 64-bit build directory.  The build
   system will look for the 32-bit build under {source_directory}/osxx86 by
   default, but you can override this by setting the BUILDDIR32 variable on the
@@ -758,13 +737,13 @@ make iosdmg [BUILDDIR32={32-bit build directory}] \
   [BUILDDIRARMV7S={ARM v7s build directory}]
 
   On OS X systems, this creates a Macintosh package and disk image in which the
-  libmozjpeg static libraries contain ARM architectures necessary to build
+  mozjpeg static libraries contain ARM architectures necessary to build
   iOS applications.  If building on an x86-64 system, the binaries will also
   contain the i386 architecture, as with 'make udmg' above.  You should first
-  configure ARM v6, ARM v7, and/or ARM v7s out-of-tree builds of libmozjpeg
-  (see "Building libmozjpeg for iOS" above.)  If you are building an x86-64
-  version of libmozjpeg, you should configure a 32-bit out-of-tree build as
-  well.  Next, build libmozjpeg as you would normally, using an out-of-tree
+  configure ARM v6, ARM v7, and/or ARM v7s out-of-tree builds of mozjpeg
+  (see "Building mozjpeg for iOS" above.)  If you are building an x86-64
+  version of mozjpeg, you should configure a 32-bit out-of-tree build as
+  well.  Next, build mozjpeg as you would normally, using an out-of-tree
   build.  When it is built, run 'make iosdmg' from the build directory.  The
   build system will look for the ARM v6 build under {source_directory}/iosarmv6
   by default, the ARM v7 build under {source_directory}/iosarmv7 by default,
@@ -808,24 +787,24 @@ Building a Windows installer requires the Nullsoft Install System
 **     Regression testing
 *******************************************************************************
 
-The most common way to test libmozjpeg is by invoking 'make test' on
+The most common way to test mozjpeg is by invoking 'make test' on
 Unix/Linux platforms or 'ctest' on Windows platforms, once the build has
 completed.  This runs a series of tests to ensure that mathematical
-compatibility has been maintained between libmozjpeg and libjpeg v6b.  This
-also invokes the MozJPEG unit tests, which ensure that the colorspace
-extensions, YUV encoding, decompression scaling, and other features of the
-MozJPEG C and Java APIs are working properly (and, by extension, that the
-equivalent features of the underlying libjpeg API are also working.)
+compatibility has been maintained.  This also invokes the TurboJPEG unit tests,
+which ensure that the colorspace extensions, YUV encoding, decompression
+scaling, and other features of the TurboJPEG C and Java APIs are working
+properly (and, by extension, that the equivalent features of the underlying
+libjpeg API are also working.)
 
 Invoking 'make testclean' or 'nmake testclean' (if using NMake) or building
 the 'testclean' target (if using the Visual Studio IDE) will clean up the
 output images generated by 'make test'.
 
-On Unix/Linux platforms, more extensive tests of the MozJPEG C and Java
-wrappers can be run by invoking 'make tjtest'.  These extended MozJPEG tests
-essentially iterate through all of the available features of the MozJPEG APIs
-that are not covered by the MozJPEG unit tests (this includes the lossless
+On Unix/Linux platforms, more extensive tests of the TurboJPEG C and Java
+wrappers can be run by invoking 'make tjtest'.  These extended TurboJPEG tests
+essentially iterate through all of the available features of the TurboJPEG APIs
+that are not covered by the TurboJPEG unit tests (this includes the lossless
 transform options) and compare the images generated by each feature to images
 generated using the equivalent feature in the libjpeg API.  The extended
-MozJPEG tests are meant to test for regressions in the MozJPEG wrappers,
+TurboJPEG tests are meant to test for regressions in the TurboJPEG wrappers,
 not in the underlying libjpeg API library.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 709b90c7..15fafcf0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,8 +47,8 @@ message(STATUS "VERSION = ${VERSION}, BUILD = ${BUILD}")
 option(WITH_SIMD "Include SIMD extensions" TRUE)
 option(WITH_ARITH_ENC "Include arithmetic encoding support" TRUE)
 option(WITH_ARITH_DEC "Include arithmetic decoding support" TRUE)
-option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes libmozjpeg backward incompatible with libjpeg v6b)" FALSE)
-option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes libmozjpeg backward incompatible with libjpeg v6b)" FALSE)
+option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes mozjpeg backward incompatible with libjpeg v6b)" FALSE)
+option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes mozjpeg backward incompatible with libjpeg v6b)" FALSE)
 option(WITH_MEM_SRCDST "Include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI" TRUE)
 option(WITH_TURBOJPEG "Include the TurboJPEG wrapper library and associated test programs" TRUE)
 option(WITH_JAVA "Build Java wrapper for the TurboJPEG library" FALSE)
@@ -155,7 +155,7 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
     set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_INSTALL_PREFIX_DEFAULT}64)
   endif()
   set(CMAKE_INSTALL_PREFIX "c:/${CMAKE_INSTALL_PREFIX_DEFAULT}" CACHE PATH
-    "Directory into which to install libmozjpeg (default: c:/${CMAKE_INSTALL_PREFIX_DEFAULT})"
+    "Directory into which to install mozjpeg (default: c:/${CMAKE_INSTALL_PREFIX_DEFAULT})"
     FORCE)
 endif()
 
diff --git a/LICENSE.txt b/LICENSE.txt
index 32be1d53..0034ed19 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -2,6 +2,6 @@ Redistribution and use in source and binary forms, with or without modification,
 
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-* Neither the name of the libmozjpeg Project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+* Neither the name of the mozjpeg Project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE, DATA, OR PROFITS;  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/configure.ac b/configure.ac
index 5386e5d0..398717f1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -149,10 +149,10 @@ AC_ARG_VAR(JPEG_LIB_VERSION, [libjpeg API version (62, 70, or 80)])
 if test "x$JPEG_LIB_VERSION" = "x"; then
   AC_ARG_WITH([jpeg7],
     AC_HELP_STRING([--with-jpeg7],
-      [Emulate libjpeg v7 API/ABI (this makes libmozjpeg backward incompatible with libjpeg v6b.)]))
+      [Emulate libjpeg v7 API/ABI (this makes mozjpeg backward incompatible with libjpeg v6b.)]))
   AC_ARG_WITH([jpeg8],
     AC_HELP_STRING([--with-jpeg8],
-      [Emulate libjpeg v8 API/ABI (this makes libmozjpeg backward incompatible with libjpeg v6b.)]))
+      [Emulate libjpeg v8 API/ABI (this makes mozjpeg backward incompatible with libjpeg v6b.)]))
   if test "x${with_jpeg8}" = "xyes"; then
     JPEG_LIB_VERSION=80
   else
@@ -170,9 +170,9 @@ AC_DEFINE_UNQUOTED(JPEG_LIB_VERSION, [$JPEG_LIB_VERSION],
   [libjpeg API version])
 
 AC_ARG_VAR(SO_MAJOR_VERSION,
-  [Major version of the libmozjpeg shared library (default is determined by the API version)])
+  [Major version of the mozjpeg shared library (default is determined by the API version)])
 AC_ARG_VAR(SO_MINOR_VERSION,
-  [Minor version of the libmozjpeg shared library (default is determined by the API version)])
+  [Minor version of the mozjpeg shared library (default is determined by the API version)])
 if test "x$SO_MAJOR_VERSION" = "x"; then
   case "$JPEG_LIB_VERSION" in
     62)  SO_MAJOR_VERSION=$JPEG_LIB_VERSION ;;
@@ -217,12 +217,12 @@ AC_SUBST(SO_MINOR_VERSION)
 AC_SUBST(SO_AGE)
 AC_SUBST(MEM_SRCDST_FUNCTIONS)
 
-AC_DEFINE_UNQUOTED(LIBMOZJPEG_VERSION, [$VERSION], [libmozjpeg version])
+AC_DEFINE_UNQUOTED(MOZJPEG_VERSION, [$VERSION], [mozjpeg version])
 
 VERSION_SCRIPT=yes
 AC_ARG_ENABLE([ld-version-script],
   AS_HELP_STRING([--disable-ld-version-script],
-    [Disable linker version script for libmozjpeg (default is to use linker version script if the linker supports it)]),
+    [Disable linker version script for mozjpeg (default is to use linker version script if the linker supports it)]),
   [VERSION_SCRIPT=$enableval], [])
 
 AC_MSG_CHECKING([whether the linker supports version scripts])
@@ -250,7 +250,7 @@ if test "x$VERSION_SCRIPT_FLAG" = "x"; then
 fi
 LDFLAGS="$SAVED_LDFLAGS"
 
-AC_MSG_CHECKING([whether to use version script when building libmozjpeg])
+AC_MSG_CHECKING([whether to use version script when building mozjpeg])
 AC_MSG_RESULT($VERSION_SCRIPT)
 
 AM_CONDITIONAL(VERSION_SCRIPT, test "x$VERSION_SCRIPT" = "xyes")
diff --git a/release/License.rtf b/release/License.rtf
index 4b8e8baf..a155d57d 100755
--- a/release/License.rtf
+++ b/release/License.rtf
@@ -12,7 +12,7 @@
 \pard\tx220\tx720\pardeftab720\li720\fi-720
 \ls1\ilvl0\cf0 {\listtext	\'95	}Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\
 {\listtext	\'95	}Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\
-{\listtext	\'95	}Neither the name of the libmozjpeg Project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\
+{\listtext	\'95	}Neither the name of the mozjpeg Project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\
 \pard\pardeftab720\qc
 \cf0 \
 \pard\pardeftab720
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index b9f6ca57..e0e19133 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,5 +1,3 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+mozjpeg is a fork of libjpeg-turbo that aims to speed up load times of web pages by reducing the size (and, by extension, the transmission time) of JPEG files.  It accomplishes this by enabling optimized Huffman trees and progressive entropy coding by default in the JPEG compressor, as well as splitting the spectrum of DCT coefficients into separate scans and using Trellis quantisation.
 
-libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
-
-libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and VirtualGL projects made numerous enhancements to the codec in 2009, and in early 2010, libjpeg-turbo spun off into an independent project, with the goal of making high-speed JPEG compression/decompression technology available to a broader range of users and developers.
+Although it is based on libjpeg-turbo, mozjpeg is not intended to be a general-purpose or high-performance JPEG library.  Its performance is highly "asymmetric".  That is, the JPEG files it generates require much more time to compress than to decompress.  When the default settings are used, mozjpeg is considerably slower than libjpeg-turbo or even libjpeg at compressing images.  Thus, it is not generally suitable for real-time compression.  It is best used as part of a web encoding workflow.
diff --git a/release/Welcome.rtf b/release/Welcome.rtf
index dab6445b..ed34c7c8 100755
--- a/release/Welcome.rtf
+++ b/release/Welcome.rtf
@@ -5,7 +5,7 @@
 \deftab720
 \pard\pardeftab720
 
-\f0\fs24 \cf0 This installer will install the libmozjpeg SDK and run-time libraries onto your computer so that you can use libmozjpeg to build new applications or accelerate existing ones.  To remove the libmozjpeg package, run\
+\f0\fs24 \cf0 This installer will install the mozjpeg SDK and run-time libraries onto your computer so that you can use mozjpeg to build new applications.  To remove the mozjpeg package, run\
 \
 \pard\pardeftab720
 
diff --git a/release/deb-control.tmpl b/release/deb-control.tmpl
index 50df76b2..ae3a14ec 100644
--- a/release/deb-control.tmpl
+++ b/release/deb-control.tmpl
@@ -4,28 +4,21 @@ Section: misc
 Priority: optional
 Architecture: {__ARCH}
 Essential: no
-Maintainer: The libmozjpeg Project <joshmoz@gmail.com>
+Maintainer: Mozilla Research <joshmoz@gmail.com>
 Homepage: https://github.com/mozilla/mozjpeg
 Installed-Size: {__SIZE}
-Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
- libmozjpeg is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
- NEON) to accelerate baseline JPEG compression and decompression on x86,
- x86-64, and ARM systems.  On such systems, libmozjpeg is generally 2-4x as
- fast as libjpeg, all else being equal.  On other types of systems,
- libmozjpeg can still outperform libjpeg by a significant amount, by virtue
- of its highly-optimized Huffman coding routines.  In many cases, the
- performance of libmozjpeg rivals that of proprietary high-speed JPEG
- codecs.
+Description: A JPEG codec that provides increased compression for JPEG images (at the expense of compression performance)
+ mozjpeg is a fork of libjpeg-turbo that aims to speed up load times of web
+ pages by reducing the size (and, by extension, the transmission time) of JPEG
+ files.  It accomplishes this by enabling optimized Huffman trees and
+ progressive entropy coding by default in the JPEG compressor, as well as
+ splitting the spectrum of DCT coefficients into separate scans and using
+ Trellis quantisation.
  .
- libmozjpeg implements both the traditional libjpeg API as well as the less
- powerful but more straightforward TurboJPEG API.  libmozjpeg also features
- colorspace extensions that allow it to compress from/decompress to 32-bit and
- big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
- interface.
- .
- libmozjpeg was forked from libjpeg-turbo. libjpeg-turbo was originally based on
- libjpeg/SIMD, an MMX-accelerated derivative of libjpeg v6b developed by Miyasaka
- Masaru.  The TigerVNC and VirtualGL projects made numerous enhancements to the
- codec in 2009, and in early 2010, libjpeg-turbo spun off into an independent
- project, with the goal of making high-speed JPEG compression/decompression technology
- available to a broader range of users and developers.
+ Although it is based on libjpeg-turbo, mozjpeg is not intended to be a
+ general-purpose or high-performance JPEG library.  Its performance is highly
+ "asymmetric".  That is, the JPEG files it generates require much more time to
+ compress than to decompress.  When the default settings are used, mozjpeg is
+ considerably slower than libjpeg-turbo or even libjpeg at compressing images.
+ Thus, it is not generally suitable for real-time compression.  It is best used
+ as part of a web encoding workflow.
diff --git a/release/libmozjpeg.spec.in b/release/libmozjpeg.spec.in
index 7678ca37..a0d71213 100644
--- a/release/libmozjpeg.spec.in
+++ b/release/libmozjpeg.spec.in
@@ -1,4 +1,4 @@
-# Path under which libmozjpeg should be installed
+# Path under which mozjpeg should be installed
 %define _prefix %{__prefix}
 
 # Path under which executables should be installed
@@ -25,13 +25,13 @@
 # Path under which man pages should be installed
 %define _mandir %{__mandir}
 
-Summary: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
+Summary: A JPEG codec that provides increased compression for JPEG images (at the expense of compression performance)
 Name: @PKGNAME@
 Version: @VERSION@
-Vendor: The libmozjpeg Project
+Vendor: Mozilla Research
 URL: https://github.com/mozilla/mozjpeg
 Group: System Environment/Libraries
-#-->Source0: http://prdownloads.sourceforge.net/libmozjpeg/libmozjpeg-%{version}.tar.gz
+#-->Source0: https://github.com/mozilla/mozjpeg/archive/v%{version}.tar.gz
 Release: @BUILD@
 License: BSD-style
 BuildRoot: %{_blddir}/%{name}-buildroot-%{version}-%{release}
@@ -43,30 +43,23 @@ Provides: %{name} = %{version}-%{release}, @PACKAGE_NAME@ = %{version}-%{release
 %endif
 
 %description
-libmozjpeg is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64,
-and ARM systems.  On such systems, libmozjpeg is generally 2-4x as fast as
-libjpeg, all else being equal.  On other types of systems, libmozjpeg can
-still outperform libjpeg by a significant amount, by virtue of its
-highly-optimized Huffman coding routines.  In many cases, the performance of
-libmozjpeg rivals that of proprietary high-speed JPEG codecs.
+mozjpeg is a fork of libjpeg-turbo that aims to speed up load times of web
+pages by reducing the size (and, by extension, the transmission time) of JPEG
+files.  It accomplishes this by enabling optimized Huffman trees and
+progressive entropy coding by default in the JPEG compressor, as well as
+splitting the spectrum of DCT coefficients into separate scans and using
+Trellis quantisation.
 
-libmozjpeg implements both the traditional libjpeg API as well as the less
-powerful but more straightforward TurboJPEG API.  libmozjpeg also features
-colorspace extensions that allow it to compress from/decompress to 32-bit and
-big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
-interface.
-
-libmozjpeg was forked from libjpeg-turbo.
-libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
-derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
-VirtualGL projects made numerous enhancements to the codec in 2009, and in
-early 2010, libjpeg-turbo spun off into an independent project, with the goal
-of making high-speed JPEG compression/decompression technology available to a
-broader range of users and developers.
+Although it is based on libjpeg-turbo, mozjpeg is not intended to be a
+general-purpose or high-performance JPEG library.  Its performance is highly
+"asymmetric".  That is, the JPEG files it generates require much more time to
+compress than to decompress.  When the default settings are used, mozjpeg is
+considerably slower than libjpeg-turbo or even libjpeg at compressing images.
+Thus, it is not generally suitable for real-time compression.  It is best used
+as part of a web encoding workflow.
 
 #-->%prep
-#-->%setup -q -n libmozjpeg-%{version}
+#-->%setup -q -n mozjpeg-%{version}
 
 #-->%build
 #-->./configure prefix=%{_prefix} bindir=%{_bindir} datadir=%{_datadir} \
@@ -87,7 +80,7 @@ rm -f $RPM_BUILD_ROOT%{_libdir}/*.la
 
 LJT_LIBDIR=%{__libdir}
 if [ ! "$LJT_LIBDIR" = "%{_libdir}" ]; then
-	echo ERROR: libmozjpeg must be configured with libdir=%{_prefix}/%{_lib} when generating an in-tree RPM for this architecture.
+	echo ERROR: mozjpeg must be configured with libdir=%{_prefix}/%{_lib} when generating an in-tree RPM for this architecture.
 	exit 1
 fi
 
diff --git a/release/uninstall.in b/release/uninstall.in
index 9eebf201..7612d617 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -8,7 +8,7 @@
 # - Redistributions in binary form must reproduce the above copyright notice,
 #   this list of conditions and the following disclaimer in the documentation
 #   and/or other materials provided with the distribution.
-# - Neither the name of the libmozjpeg Project nor the names of its
+# - Neither the name of the mozjpeg Project nor the names of its
 #   contributors may be used to endorse or promote products derived from this
 #   software without specific prior written permission.
 #

From 6b236241d287f9944904021cb9c178ba109e28f3 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Wed, 5 Nov 2014 20:47:54 -0600
Subject: [PATCH 007/108] Change "libmozjpeg" to "mozjpeg" in packaging and
 install functions This might be slightly more controversial, since it changes
 the CMake and autotools project names and the binary package names to
 "mozjpeg", and it changes the default install directory to /opt/mozjpeg.  To
 me, this makes much more sense, but it does represent a change in operational
 behavior, which is why I put it in a separate commit.

---
 BUILDING.txt                                   | 14 +++++++-------
 CMakeLists.txt                                 |  8 ++++----
 Makefile.am                                    | 18 +++++++++---------
 configure.ac                                   | 16 ++++++++--------
 release/Welcome.rtf                            |  2 +-
 release/makecygwinpkg.in                       |  2 +-
 release/makedpkg.in                            |  2 +-
 release/makemacpkg.in                          |  4 ++--
 release/{libmozjpeg.nsi.in => mozjpeg.nsi.in}  |  0
 .../{libmozjpeg.spec.in => mozjpeg.spec.in}    |  6 +++---
 release/uninstall.in                           |  4 ++--
 11 files changed, 38 insertions(+), 38 deletions(-)
 rename release/{libmozjpeg.nsi.in => mozjpeg.nsi.in} (100%)
 rename release/{libmozjpeg.spec.in => mozjpeg.spec.in} (95%)

diff --git a/BUILDING.txt b/BUILDING.txt
index 6f43c792..ceeeb049 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -173,8 +173,8 @@ For example,
 
 will install the header files in /usr/local/include and the library files in
 /usr/local/lib64.  If 'prefix' and 'libdir' are not specified, then the default
-is to install the header files in /opt/libmozjpeg/include and the library
-files in /opt/libmozjpeg/lib32 (32-bit) or /opt/libmozjpeg/lib64
+is to install the header files in /opt/mozjpeg/include and the library
+files in /opt/mozjpeg/lib32 (32-bit) or /opt/mozjpeg/lib64
 (64-bit.)
 
 NOTE: You can specify a prefix of /usr and a libdir of, for instance,
@@ -635,12 +635,12 @@ to the cmake command line.
 For example,
 
   cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=c:\libmozjpeg {source_directory}
+    -DCMAKE_INSTALL_PREFIX=c:\mozjpeg {source_directory}
   nmake install
 
-will install the header files in c:\libmozjpeg\include, the library files
-in c:\libmozjpeg\lib, the DLL's in c:\libmozjpeg\bin, and the
-documentation in c:\libmozjpeg\doc.
+will install the header files in c:\mozjpeg\include, the library files
+in c:\mozjpeg\lib, the DLL's in c:\mozjpeg\bin, and the
+documentation in c:\mozjpeg\doc.
 
 
 =============
@@ -773,7 +773,7 @@ If using MinGW:
 
 If using the Visual Studio IDE, build the "installer" project.
 
-The installer package (libmozjpeg[-gcc][64].exe) will be located under
+The installer package (mozjpeg[-gcc][64].exe) will be located under
 {build_directory}.  If building using the Visual Studio IDE, then the installer
 package will be located in a subdirectory with the same name as the
 configuration you built (such as {build_directory}\Debug\ or
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15fafcf0..8095f351 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ if(POLICY CMP0022)
   cmake_policy(SET CMP0022 OLD)
 endif()
 
-project(libmozjpeg C)
+project(mozjpeg C)
 set(VERSION 2.1)
 
 if(CYGWIN OR NOT CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
@@ -778,16 +778,16 @@ endif()
 
 STRING(REGEX REPLACE "/" "\\\\" INST_DIR "${CMAKE_INSTALL_PREFIX}")
 
-configure_file(release/libmozjpeg.nsi.in libmozjpeg.nsi @ONLY)
+configure_file(release/mozjpeg.nsi.in mozjpeg.nsi @ONLY)
 
 if(WITH_JAVA)
   set(JAVA_DEPEND java)
 endif()
 add_custom_target(installer
-  makensis -nocd ${INST_DEFS} libmozjpeg.nsi
+  makensis -nocd ${INST_DEFS} mozjpeg.nsi
   DEPENDS jpeg jpeg-static turbojpeg turbojpeg-static rdjpgcom wrjpgcom
     cjpeg djpeg jpegtran tjbench ${JAVA_DEPEND}
-  SOURCES libmozjpeg.nsi)
+  SOURCES mozjpeg.nsi)
 
 if(WITH_TURBOJPEG)
   set(TURBOJPEG_TARGETS turbojpeg turbojpeg-static tjbench)
diff --git a/Makefile.am b/Makefile.am
index 1b47317c..3b99865b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -536,26 +536,26 @@ if WITH_JAVA
 endif
 
 
-pkgscripts/libmozjpeg.spec: pkgscripts/libmozjpeg.spec.tmpl
-	cat pkgscripts/libmozjpeg.spec.tmpl | sed s@%{__prefix}@$(prefix)@g | \
+pkgscripts/mozjpeg.spec: pkgscripts/mozjpeg.spec.tmpl
+	cat pkgscripts/mozjpeg.spec.tmpl | sed s@%{__prefix}@$(prefix)@g | \
 		sed s@%{__bindir}@$(bindir)@g | sed s@%{__datadir}@$(datadir)@g | \
 		sed s@%{__docdir}@$(docdir)@g | sed s@%{__includedir}@$(includedir)@g | \
 		sed s@%{__libdir}@$(libdir)@g | sed s@%{__mandir}@$(mandir)@g \
-		> pkgscripts/libmozjpeg.spec
+		> pkgscripts/mozjpeg.spec
 
-rpm: all pkgscripts/libmozjpeg.spec
+rpm: all pkgscripts/mozjpeg.spec
 	TMPDIR=`mktemp -d /tmp/${PACKAGE_NAME}-build.XXXXXX`; \
 	mkdir -p $$TMPDIR/RPMS; \
 	ln -fs `pwd` $$TMPDIR/BUILD; \
 	rm -f ${PKGNAME}-${VERSION}.${RPMARCH}.rpm; \
 	rpmbuild -bb --define "_blddir $$TMPDIR/buildroot"  \
 		--define "_topdir $$TMPDIR" \
-		--target ${RPMARCH} pkgscripts/libmozjpeg.spec; \
+		--target ${RPMARCH} pkgscripts/mozjpeg.spec; \
 	cp $$TMPDIR/RPMS/${RPMARCH}/${PKGNAME}-${VERSION}-${BUILD}.${RPMARCH}.rpm \
 		${PKGNAME}-${VERSION}.${RPMARCH}.rpm; \
 	rm -rf $$TMPDIR
 
-srpm: dist-gzip pkgscripts/libmozjpeg.spec
+srpm: dist-gzip pkgscripts/mozjpeg.spec
 	TMPDIR=`mktemp -d /tmp/${PACKAGE_NAME}-build.XXXXXX`; \
 	mkdir -p $$TMPDIR/RPMS; \
 	mkdir -p $$TMPDIR/SRPMS; \
@@ -564,10 +564,10 @@ srpm: dist-gzip pkgscripts/libmozjpeg.spec
 	mkdir -p $$TMPDIR/SPECS; \
 	rm -f ${PKGNAME}-${VERSION}.src.rpm; \
 	cp ${PACKAGE_NAME}-${VERSION}.tar.gz $$TMPDIR/SOURCES; \
-	cat pkgscripts/libmozjpeg.spec | sed s/%{_blddir}/%{_tmppath}/g \
+	cat pkgscripts/mozjpeg.spec | sed s/%{_blddir}/%{_tmppath}/g \
 		| sed s/#--\>//g \
-		> $$TMPDIR/SPECS/libmozjpeg.spec; \
-	rpmbuild -bs --define "_topdir $$TMPDIR" $$TMPDIR/SPECS/libmozjpeg.spec; \
+		> $$TMPDIR/SPECS/mozjpeg.spec; \
+	rpmbuild -bs --define "_topdir $$TMPDIR" $$TMPDIR/SPECS/mozjpeg.spec; \
 	cp $$TMPDIR/SRPMS/${PKGNAME}-${VERSION}-${BUILD}.src.rpm \
 		${PKGNAME}-${VERSION}.src.rpm; \
 	rm -rf $$TMPDIR
diff --git a/configure.ac b/configure.ac
index 398717f1..1715401b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,12 +2,12 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([libmozjpeg], [2.1])
+AC_INIT([mozjpeg], [2.1])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES(yes)])
-AC_PREFIX_DEFAULT(/opt/libmozjpeg)
+AC_PREFIX_DEFAULT(/opt/mozjpeg)
 
 # Checks for programs.
 SAVED_CFLAGS=${CFLAGS}
@@ -22,7 +22,7 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 AC_PROG_LIBTOOL
 AC_PROG_LN_S
 
-# When the prefix is /opt/libmozjpeg, we assume that an "official" binary is
+# When the prefix is /opt/mozjpeg, we assume that an "official" binary is
 # being created, and thus we install things into specific locations.
 
 old_prefix=${prefix}
@@ -31,12 +31,12 @@ if test "x$prefix" = "xNONE" -a "x$ac_default_prefix" != "x"; then
 fi
 DATADIR=`eval echo ${datadir}`
 DATADIR=`eval echo $DATADIR`
-if test "$DATADIR" = "/opt/libmozjpeg/share"; then
+if test "$DATADIR" = "/opt/mozjpeg/share"; then
   datadir='${prefix}'
 fi
 DATADIR=`eval echo ${datarootdir}`
 DATADIR=`eval echo $DATADIR`
-if test "$DATADIR" = "/opt/libmozjpeg/share"; then
+if test "$DATADIR" = "/opt/mozjpeg/share"; then
   datarootdir='${prefix}'
 fi
 
@@ -48,7 +48,7 @@ fi
 if test "x${libdir}" = 'x${exec_prefix}/lib' -o "x${libdir}" = 'x${prefix}/lib'; then
   LIBDIR=`eval echo ${libdir}`
   LIBDIR=`eval echo $LIBDIR`
-  if test "$LIBDIR" = "/opt/libmozjpeg/lib"; then
+  if test "$LIBDIR" = "/opt/mozjpeg/lib"; then
     case $host_os in
       darwin*)
         ;;
@@ -515,7 +515,7 @@ AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"])
 AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
 AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
 
-AC_ARG_VAR(PKGNAME, [distribution package name (default: libmozjpeg)])
+AC_ARG_VAR(PKGNAME, [distribution package name (default: mozjpeg)])
 if test "x$PKGNAME" = "x"; then
   PKGNAME=$PACKAGE_NAME
 fi
@@ -546,7 +546,7 @@ AC_DEFINE_UNQUOTED([BUILD], "$BUILD", [libjpeg-turbo build number])
 AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_HEADERS([jconfig.h])
 AC_CONFIG_HEADERS([jconfigint.h])
-AC_CONFIG_FILES([pkgscripts/libmozjpeg.spec.tmpl:release/libmozjpeg.spec.in])
+AC_CONFIG_FILES([pkgscripts/mozjpeg.spec.tmpl:release/mozjpeg.spec.in])
 AC_CONFIG_FILES([pkgscripts/makecygwinpkg.tmpl:release/makecygwinpkg.in])
 AC_CONFIG_FILES([pkgscripts/makedpkg.tmpl:release/makedpkg.in])
 AC_CONFIG_FILES([pkgscripts/makemacpkg.tmpl:release/makemacpkg.in])
diff --git a/release/Welcome.rtf b/release/Welcome.rtf
index ed34c7c8..3d5995aa 100755
--- a/release/Welcome.rtf
+++ b/release/Welcome.rtf
@@ -9,7 +9,7 @@
 \
 \pard\pardeftab720
 
-\f1 \cf0   /opt/libmozjpeg/bin/uninstall\
+\f1 \cf0   /opt/mozjpeg/bin/uninstall\
 \pard\pardeftab720
 
 \f0 \cf0 \
diff --git a/release/makecygwinpkg.in b/release/makecygwinpkg.in
index ac3ca982..8a735ed5 100755
--- a/release/makecygwinpkg.in
+++ b/release/makecygwinpkg.in
@@ -31,7 +31,7 @@ __PWD=`pwd`
 make install DESTDIR=$TMPDIR/pkg docdir=/usr/share/doc/$PACKAGE_NAME-$VERSION \
 	exampledir=/usr/share/doc/$PACKAGE_NAME-$VERSION
 rm $TMPDIR/pkg$LIBDIR/*.la
-if [ "$PREFIX" = "/opt/libmozjpeg" -a "$DOCDIR" = "/opt/libmozjpeg/doc" ]; then
+if [ "$PREFIX" = "/opt/mozjpeg" -a "$DOCDIR" = "/opt/mozjpeg/doc" ]; then
 	ln -fs /usr/share/doc/$PACKAGE_NAME-$VERSION $TMPDIR/pkg$DOCDIR
 fi
 cd $TMPDIR/pkg
diff --git a/release/makedpkg.in b/release/makedpkg.in
index 06cc4db3..2e004fc5 100644
--- a/release/makedpkg.in
+++ b/release/makedpkg.in
@@ -45,7 +45,7 @@ makedeb()
 		make install DESTDIR=$TMPDIR docdir=/usr/share/doc/$DIRNAME-$VERSION \
 			exampledir=/usr/share/doc/$DIRNAME-$VERSION
 		rm -f $TMPDIR$LIBDIR/*.la
-		if [ "$PREFIX" = "/opt/libmozjpeg" -a "$DOCDIR" = "/opt/libmozjpeg/doc" ]; then
+		if [ "$PREFIX" = "/opt/mozjpeg" -a "$DOCDIR" = "/opt/mozjpeg/doc" ]; then
 			ln -fs /usr/share/doc/$DIRNAME-$VERSION $TMPDIR$DOCDIR
 		fi
 	fi
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 8b887949..8b8a1865 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -91,7 +91,7 @@ make install DESTDIR=$PKGROOT docdir=/Library/Documentation/$PACKAGE_NAME \
 	exampledir=/Library/Documentation/$PACKAGE_NAME
 rm -f $PKGROOT$LIBDIR/*.la
 
-if [ "$PREFIX" = "/opt/libmozjpeg" -a "$DOCDIR" = "/opt/libmozjpeg/doc" ]; then
+if [ "$PREFIX" = "/opt/mozjpeg" -a "$DOCDIR" = "/opt/mozjpeg/doc" ]; then
 	ln -fs /Library/Documentation/$PACKAGE_NAME $PKGROOT$DOCDIR
 fi
 
@@ -235,7 +235,7 @@ install_name_tool -id $LIBDIR/libturbojpeg.0.dylib $PKGROOT/$LIBDIR/libturbojpeg
 if [ $WITH_JAVA = 1 ]; then
 	ln -fs libturbojpeg.0.dylib $PKGROOT/$LIBDIR/libturbojpeg.jnilib
 fi
-if [ "$PREFIX" = "/opt/libmozjpeg" -a "$LIBDIR" = "/opt/libmozjpeg/lib" ]; then
+if [ "$PREFIX" = "/opt/mozjpeg" -a "$LIBDIR" = "/opt/mozjpeg/lib" ]; then
 	if [ ! -h $PKGROOT/$PREFIX/lib32 ]; then
 		ln -fs lib $PKGROOT/$PREFIX/lib32
 	fi
diff --git a/release/libmozjpeg.nsi.in b/release/mozjpeg.nsi.in
similarity index 100%
rename from release/libmozjpeg.nsi.in
rename to release/mozjpeg.nsi.in
diff --git a/release/libmozjpeg.spec.in b/release/mozjpeg.spec.in
similarity index 95%
rename from release/libmozjpeg.spec.in
rename to release/mozjpeg.spec.in
index a0d71213..f14558bb 100644
--- a/release/libmozjpeg.spec.in
+++ b/release/mozjpeg.spec.in
@@ -17,7 +17,7 @@
 %ifarch x86_64
 %define _lib lib64
 %else
-%if "%{_prefix}" == "/opt/libmozjpeg"
+%if "%{_prefix}" == "/opt/mozjpeg"
 %define _lib lib32
 %endif
 %endif
@@ -87,7 +87,7 @@ fi
 #-->%endif
 
 LJT_DOCDIR=%{__docdir}
-if [ "%{_prefix}" = "/opt/libmozjpeg" -a "$LJT_DOCDIR" = "/opt/libmozjpeg/doc" ]; then
+if [ "%{_prefix}" = "/opt/mozjpeg" -a "$LJT_DOCDIR" = "/opt/mozjpeg/doc" ]; then
 	ln -fs %{_docdir} $RPM_BUILD_ROOT/$LJT_DOCDIR
 fi
 
@@ -103,7 +103,7 @@ rm -rf $RPM_BUILD_ROOT
 %dir %{_docdir}
 %doc %{_docdir}/*
 %dir %{_prefix}
-%if "%{_prefix}" == "/opt/libmozjpeg" && "%{_docdir}" != "%{_prefix}/doc"
+%if "%{_prefix}" == "/opt/mozjpeg" && "%{_docdir}" != "%{_prefix}/doc"
  %{_prefix}/doc
 %endif
 %dir %{_bindir}
diff --git a/release/uninstall.in b/release/uninstall.in
index 7612d617..f00259b1 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -72,7 +72,7 @@ fi
 if [ -d $INCLUDEDIR ]; then
 	rmdir $INCLUDEDIR 2>&1 || EXITSTATUS=-1
 fi
-if [ "$PREFIX" = "/opt/libmozjpeg" -a "$LIBDIR" = "/opt/libmozjpeg/lib" ]; then
+if [ "$PREFIX" = "/opt/mozjpeg" -a "$LIBDIR" = "/opt/mozjpeg/lib" ]; then
 	if [ -h $LIBDIR\32 ]; then
 		rm $LIBDIR\32 2>&1 || EXITSTATUS=-1
 	fi
@@ -92,7 +92,7 @@ fi
 if [ -d $DATADIR -a "$DATADIR" != "$PREFIX" ]; then
 	rmdir $DATADIR 2>&1 || EXITSTATUS=-1
 fi
-if [ "$PREFIX" = "/opt/libmozjpeg" -a -h "$PREFIX/doc" ]; then
+if [ "$PREFIX" = "/opt/mozjpeg" -a -h "$PREFIX/doc" ]; then
 	rm $PREFIX/doc 2>&1 || EXITSTATUS=-1
 fi
 rmdir $PREFIX 2>&1 || EXITSTATUS=-1

From 5f1960f38c47275c114fe3275938e976bb20d19a Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Wed, 5 Nov 2014 20:54:20 -0600
Subject: [PATCH 008/108] Fix a buglet introduced in the libjpeg-turbo
 changelog by mozjpeg

---
 ChangeLog.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index b158dc4d..166788b2 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -116,14 +116,6 @@ compressed JPEG image.  If the size of the previous JPEG image was not as large
 as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
 destination buffer.
 
-[7] Improved the accuracy and performance of the non-SIMD implementation of the
-floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
-The accuracy of this implementation now matches the accuracy of the SSE/SSE2
-implementation.  Note, however, that the floating point DCT/IDCT algorithms are
-mainly a legacy feature.  They generally do not produce significantly better
-accuracy than the slow integer DCT/IDCT algorithms, and they are quite a bit
-slower.
-
 
 1.3.1
 =====

From ebc7c3a760350736fdaf60ef6c1a65505321d3d8 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Thu, 6 Nov 2014 01:14:11 -0600
Subject: [PATCH 009/108] Reconcile indentation, whitespace, and other code
 formatting with libjpeg-turbo. For whatever reason, some of these files
 didn't get fully merged from libjpeg-turbo 1.4.  They still contained tab
 characters and other formatting conventions from libjpeg-turbo 1.3.  This
 patch also fixes some obvious indentation errors in the mozjpeg-specific
 code.  There is more formatting work that needs to be done to the
 mozjpeg-specific code, to fix line overruns, incorrect operator whitespace,
 and other issues that make it not consistent with the libjpeg/libjpeg-turbo
 code.

---
 cdjpeg.h    |  26 ++--
 cjpeg.c     |  21 +--
 jccoefct.c  |  50 +++---
 jcdctmgr.c  |  23 +--
 jchuff.h    |   4 +-
 jcmarker.c  | 124 +++++++--------
 jcmaster.c  | 144 +++++++++---------
 jconfig.txt |  16 +-
 jcparam.c   | 101 ++++++------
 jcphuff.c   |   2 +-
 jpegint.h   |   5 +-
 jpeglib.h   | 430 ++++++++++++++++++++++++++--------------------------
 rdjpeg.c    |   8 +-
 rdswitch.c  |  84 +++++-----
 turbojpeg.c |   2 +-
 15 files changed, 521 insertions(+), 519 deletions(-)

diff --git a/cdjpeg.h b/cdjpeg.h
index 4e8ae1f9..92e6e381 100644
--- a/cdjpeg.h
+++ b/cdjpeg.h
@@ -13,12 +13,12 @@
  * cjpeg and djpeg.  It is NOT used by the core JPEG library.
  */
 
-#define JPEG_CJPEG_DJPEG	/* define proper options in jconfig.h */
-#define JPEG_INTERNAL_OPTIONS	/* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */
+#define JPEG_CJPEG_DJPEG        /* define proper options in jconfig.h */
+#define JPEG_INTERNAL_OPTIONS   /* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jerror.h"		/* get library error codes too */
-#include "cderror.h"		/* get application-specific error codes */
+#include "jerror.h"             /* get library error codes too */
+#include "cderror.h"            /* get application-specific error codes */
 
 #define JPEG_RAW_READER 0
 
@@ -85,9 +85,9 @@ struct djpeg_dest_struct {
  */
 
 struct cdjpeg_progress_mgr {
-  struct jpeg_progress_mgr pub;	/* fields known to JPEG library */
-  int completed_extra_passes;	/* extra passes completed */
-  int total_extra_passes;	/* total extra */
+  struct jpeg_progress_mgr pub; /* fields known to JPEG library */
+  int completed_extra_passes;   /* extra passes completed */
+  int total_extra_passes;       /* total extra */
   /* last printed percentage stored here to avoid multiple printouts */
   int percent_done;
 };
@@ -137,15 +137,15 @@ EXTERN(FILE *) write_stdout (void);
 
 /* miscellaneous useful macros */
 
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#define WRITE_BINARY	"w"
+#ifdef DONT_USE_B_MODE          /* define mode parameters for fopen() */
+#define READ_BINARY     "r"
+#define WRITE_BINARY    "w"
 #else
-#define READ_BINARY	"rb"
-#define WRITE_BINARY	"wb"
+#define READ_BINARY     "rb"
+#define WRITE_BINARY    "wb"
 #endif
 
-#ifndef EXIT_FAILURE		/* define exit() codes if not provided */
+#ifndef EXIT_FAILURE            /* define exit() codes if not provided */
 #define EXIT_FAILURE  1
 #endif
 #ifndef EXIT_SUCCESS
diff --git a/cjpeg.c b/cjpeg.c
index 098dbc32..784e7bc0 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -294,7 +294,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "dct", 2)) {
       /* Select DCT algorithm. */
-      if (++argn >= argc) { /* advance to next argument */
+      if (++argn >= argc) {      /* advance to next argument */
         fprintf(stderr, "%s: missing argument for dct\n", progname);
         usage();
       }
@@ -340,14 +340,14 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       jpeg_set_colorspace(cinfo, JCS_RGB);
 
     } else if (keymatch(arg, "lambda1", 7)) {
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1,
                              atof(argv[argn]));
 
     } else if (keymatch(arg, "lambda2", 7)) {
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2,
                              atof(argv[argn]));
 
@@ -379,7 +379,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "outfile", 4)) {
       /* Set output file name. */
-      if (++argn >= argc)	{ /* advance to next argument */
+      if (++argn >= argc) {      /* advance to next argument */
         fprintf(stderr, "%s: missing argument for outfile\n", progname);
         usage();
       }
@@ -408,7 +408,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "quality", 1)) {
       /* Quality ratings (quantization table scaling factors). */
-      if (++argn >= argc)	{ /* advance to next argument */
+      if (++argn >= argc) {      /* advance to next argument */
         fprintf(stderr, "%s: missing argument for quality\n", progname);
         usage();
       }
@@ -743,7 +743,7 @@ main (int argc, char **argv)
           GETJOCTET(marker->data[2]) == 0x49 &&
           GETJOCTET(marker->data[3]) == 0x46 &&
           GETJOCTET(marker->data[4]) == 0)
-        continue;			/* reject duplicate JFIF */
+        continue;                       /* reject duplicate JFIF */
       if (cinfo.write_Adobe_marker &&
           marker->marker == JPEG_APP0+14 &&
           marker->data_length >= 5 &&
@@ -752,8 +752,9 @@ main (int argc, char **argv)
           GETJOCTET(marker->data[2]) == 0x6F &&
           GETJOCTET(marker->data[3]) == 0x62 &&
           GETJOCTET(marker->data[4]) == 0x65)
-        continue;			/* reject duplicate Adobe */
-      jpeg_write_marker(&cinfo, marker->marker, marker->data, marker->data_length);
+        continue;                       /* reject duplicate Adobe */
+      jpeg_write_marker(&cinfo, marker->marker, marker->data,
+                        marker->data_length);
     }
   }
   
diff --git a/jccoefct.c b/jccoefct.c
index ad8e4e36..38769976 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -19,6 +19,7 @@
 #include "jpeglib.h"
 #include "jchuff.h"
 
+
 /* We use a full-image coefficient buffer when doing Huffman optimization,
  * and also for writing multiple-scan JPEG files.  In all cases, the DCT
  * step is run during the first pass, and subsequent passes need only read
@@ -190,7 +191,8 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
             (*cinfo->fdct->forward_DCT) (cinfo, compptr,
                                          input_buf[compptr->component_index],
                                          coef->MCU_buffer[blkn],
-					 ypos, xpos, (JDIMENSION) blockcnt, NULL);
+                                         ypos, xpos, (JDIMENSION) blockcnt,
+                                         NULL);
             if (blockcnt < compptr->MCU_width) {
               /* Create some dummy blocks at the right edge of the image. */
               jzero_far((void *) coef->MCU_buffer[blkn + blockcnt],
@@ -302,7 +304,8 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       (*cinfo->fdct->forward_DCT) (cinfo, compptr,
                                    input_buf[ci], thisblockrow,
                                    (JDIMENSION) (block_row * DCTSIZE),
-				   (JDIMENSION) 0, blocks_across, buffer_dst[block_row]);
+                                   (JDIMENSION) 0, blocks_across,
+                                   buffer_dst[block_row]);
       if (ndummy > 0) {
         /* Create dummy blocks at the right edge of the image. */
         thisblockrow += blocks_across; /* => first dummy block */
@@ -410,13 +413,13 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
                        &lastDC);
       
       if (ndummy > 0) {
-	/* Create dummy blocks at the right edge of the image. */
-	thisblockrow += blocks_across; /* => first dummy block */
-	jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
-	lastDC = thisblockrow[-1][0];
-	for (bi = 0; bi < ndummy; bi++) {
-	  thisblockrow[bi][0] = lastDC;
-	}
+        /* Create dummy blocks at the right edge of the image. */
+        thisblockrow += blocks_across; /* => first dummy block */
+        jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
+        lastDC = thisblockrow[-1][0];
+        for (bi = 0; bi < ndummy; bi++) {
+          thisblockrow[bi][0] = lastDC;
+        }
       }
     }
     /* If at end of image, create dummy block rows as needed.
@@ -425,22 +428,22 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      * This squeezes a few more bytes out of the resulting file...
      */
     if (coef->iMCU_row_num == last_iMCU_row) {
-      blocks_across += ndummy;	/* include lower right corner */
+      blocks_across += ndummy;  /* include lower right corner */
       MCUs_across = blocks_across / h_samp_factor;
       for (block_row = block_rows; block_row < compptr->v_samp_factor;
-	   block_row++) {
-	thisblockrow = buffer[block_row];
-	lastblockrow = buffer[block_row-1];
-	jzero_far((void *) thisblockrow,
-		  (size_t) (blocks_across * sizeof(JBLOCK)));
-	for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
-	  lastDC = lastblockrow[h_samp_factor-1][0];
-	  for (bi = 0; bi < h_samp_factor; bi++) {
-	    thisblockrow[bi][0] = lastDC;
-	  }
-	  thisblockrow += h_samp_factor; /* advance to next MCU in row */
-	  lastblockrow += h_samp_factor;
-	}
+           block_row++) {
+        thisblockrow = buffer[block_row];
+        lastblockrow = buffer[block_row-1];
+        jzero_far((void *) thisblockrow,
+                  (size_t) (blocks_across * sizeof(JBLOCK)));
+        for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
+          lastDC = lastblockrow[h_samp_factor-1][0];
+          for (bi = 0; bi < h_samp_factor; bi++) {
+            thisblockrow[bi][0] = lastDC;
+          }
+          thisblockrow += h_samp_factor; /* advance to next MCU in row */
+          lastblockrow += h_samp_factor;
+        }
       }
     }
   }
@@ -503,7 +506,6 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
           }
         }
       }
-
       /* Try to write the MCU. */
       if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
diff --git a/jcdctmgr.c b/jcdctmgr.c
index d64d655f..89e3400e 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -25,6 +25,7 @@
 #include <assert.h>
 #include <math.h>
 
+
 /* Private subobject for this module */
 
 typedef void (*forward_DCT_method_ptr) (DCTELEM * data);
@@ -663,7 +664,7 @@ METHODDEF(void)
 forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
              JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
              JDIMENSION start_row, JDIMENSION start_col,
-	     JDIMENSION num_blocks, JBLOCKROW dst)
+             JDIMENSION num_blocks, JBLOCKROW dst)
 /* This version is used for integer DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
@@ -697,17 +698,17 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
     if (dst) {
       int i;
       if (cinfo->dct_method == JDCT_IFAST) {
-	static const INT16 aanscales[DCTSIZE2] = {
-	  /* precomputed values scaled up by 14 bits */
-	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-	  22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
-	  21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
-	  19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
-	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-	  12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
+        static const INT16 aanscales[DCTSIZE2] = {
+          /* precomputed values scaled up by 14 bits */
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
+          21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
+          19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
-	};
+        };
         
         for (i = 0; i < DCTSIZE2; i++) {
           int x = workspace[i];
@@ -789,7 +790,7 @@ METHODDEF(void)
 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
                    JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                    JDIMENSION start_row, JDIMENSION start_col,
-		   JDIMENSION num_blocks, JBLOCKROW dst)
+                   JDIMENSION num_blocks, JBLOCKROW dst)
 /* This version is used for floating-point DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
diff --git a/jchuff.h b/jchuff.h
index 2a5ecb56..b58538bc 100644
--- a/jchuff.h
+++ b/jchuff.h
@@ -29,8 +29,8 @@
 /* Derived data constructed for each Huffman table */
 
 typedef struct {
-  unsigned int ehufco[256];	/* code for each symbol */
-  char ehufsi[256];		/* length of code for each symbol */
+  unsigned int ehufco[256];     /* code for each symbol */
+  char ehufsi[256];             /* length of code for each symbol */
   /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
 } c_derived_tbl;
 
diff --git a/jcmarker.c b/jcmarker.c
index a62da057..9268b0c9 100644
--- a/jcmarker.c
+++ b/jcmarker.c
@@ -17,7 +17,7 @@
 #include "jpegcomp.h"
 
 
-typedef enum {			/* JPEG marker codes */
+typedef enum {                  /* JPEG marker codes */
   M_SOF0  = 0xc0,
   M_SOF1  = 0xc1,
   M_SOF2  = 0xc2,
@@ -173,7 +173,7 @@ emit_dqt (j_compress_ptr cinfo, int index)
       /* The table entries must be emitted in zigzag order. */
       unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
       if (prec)
-	emit_byte(cinfo, (int) (qval >> 8));
+        emit_byte(cinfo, (int) (qval >> 8));
       emit_byte(cinfo, (int) (qval & 0xFF));
     }
 
@@ -253,33 +253,33 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
 {
   JHUFF_TBL * htbl;
   int length, i;
-  
+
   if (is_ac) {
     htbl = cinfo->ac_huff_tbl_ptrs[index];
-    index += 0x10;		/* output index has AC bit set */
+    index += 0x10;              /* output index has AC bit set */
   } else {
     htbl = cinfo->dc_huff_tbl_ptrs[index];
   }
 
   if (htbl == NULL)
     ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index);
-  
+
   if (! htbl->sent_table) {
     emit_marker(cinfo, M_DHT);
-    
+
     length = 0;
     for (i = 1; i <= 16; i++)
       length += htbl->bits[i];
-    
+
     emit_2bytes(cinfo, length + 2 + 1 + 16);
     emit_byte(cinfo, index);
-    
+
     for (i = 1; i <= 16; i++)
       emit_byte(cinfo, htbl->bits[i]);
-    
+
     for (i = 0; i < length; i++)
       emit_byte(cinfo, htbl->huffval[i]);
-    
+
     htbl->sent_table = TRUE;
   }
 }
@@ -427,12 +427,12 @@ emit_dac (j_compress_ptr cinfo)
 
     for (i = 0; i < NUM_ARITH_TBLS; i++) {
       if (dc_in_use[i]) {
-	emit_byte(cinfo, i);
-	emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
+        emit_byte(cinfo, i);
+        emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
       }
       if (ac_in_use[i]) {
-	emit_byte(cinfo, i + 0x10);
-	emit_byte(cinfo, cinfo->arith_ac_K[i]);
+        emit_byte(cinfo, i + 0x10);
+        emit_byte(cinfo, cinfo->arith_ac_K[i]);
       }
     }
   }
@@ -445,8 +445,8 @@ emit_dri (j_compress_ptr cinfo)
 /* Emit a DRI marker */
 {
   emit_marker(cinfo, M_DRI);
-  
-  emit_2bytes(cinfo, 4);	/* fixed length */
+
+  emit_2bytes(cinfo, 4);        /* fixed length */
 
   emit_2bytes(cinfo, (int) cinfo->restart_interval);
 }
@@ -458,9 +458,9 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
 {
   int ci;
   jpeg_component_info *compptr;
-  
+
   emit_marker(cinfo, code);
-  
+
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
@@ -489,13 +489,13 @@ emit_sos (j_compress_ptr cinfo)
 {
   int i, td, ta;
   jpeg_component_info *compptr;
-  
+
   emit_marker(cinfo, M_SOS);
-  
+
   emit_2bytes(cinfo, 2 * cinfo->comps_in_scan + 2 + 1 + 3); /* length */
-  
+
   emit_byte(cinfo, cinfo->comps_in_scan);
-  
+
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     compptr = cinfo->cur_comp_info[i];
     emit_byte(cinfo, compptr->component_id);
@@ -523,22 +523,22 @@ emit_jfif_app0 (j_compress_ptr cinfo)
 /* Emit a JFIF-compliant APP0 marker */
 {
   /*
-   * Length of APP0 block	(2 bytes)
-   * Block ID			(4 bytes - ASCII "JFIF")
-   * Zero byte			(1 byte to terminate the ID string)
-   * Version Major, Minor	(2 bytes - major first)
-   * Units			(1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm)
-   * Xdpu			(2 bytes - dots per unit horizontal)
-   * Ydpu			(2 bytes - dots per unit vertical)
-   * Thumbnail X size		(1 byte)
-   * Thumbnail Y size		(1 byte)
+   * Length of APP0 block       (2 bytes)
+   * Block ID                   (4 bytes - ASCII "JFIF")
+   * Zero byte                  (1 byte to terminate the ID string)
+   * Version Major, Minor       (2 bytes - major first)
+   * Units                      (1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm)
+   * Xdpu                       (2 bytes - dots per unit horizontal)
+   * Ydpu                       (2 bytes - dots per unit vertical)
+   * Thumbnail X size           (1 byte)
+   * Thumbnail Y size           (1 byte)
    */
-  
+
   emit_marker(cinfo, M_APP0);
-  
+
   emit_2bytes(cinfo, 2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1); /* length */
 
-  emit_byte(cinfo, 0x4A);	/* Identifier: ASCII "JFIF" */
+  emit_byte(cinfo, 0x4A);       /* Identifier: ASCII "JFIF" */
   emit_byte(cinfo, 0x46);
   emit_byte(cinfo, 0x49);
   emit_byte(cinfo, 0x46);
@@ -548,7 +548,7 @@ emit_jfif_app0 (j_compress_ptr cinfo)
   emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
   emit_2bytes(cinfo, (int) cinfo->X_density);
   emit_2bytes(cinfo, (int) cinfo->Y_density);
-  emit_byte(cinfo, 0);		/* No thumbnail image */
+  emit_byte(cinfo, 0);          /* No thumbnail image */
   emit_byte(cinfo, 0);
 }
 
@@ -558,12 +558,12 @@ emit_adobe_app14 (j_compress_ptr cinfo)
 /* Emit an Adobe APP14 marker */
 {
   /*
-   * Length of APP14 block	(2 bytes)
-   * Block ID			(5 bytes - ASCII "Adobe")
-   * Version Number		(2 bytes - currently 100)
-   * Flags0			(2 bytes - currently 0)
-   * Flags1			(2 bytes - currently 0)
-   * Color transform		(1 byte)
+   * Length of APP14 block      (2 bytes)
+   * Block ID                   (5 bytes - ASCII "Adobe")
+   * Version Number             (2 bytes - currently 100)
+   * Flags0                     (2 bytes - currently 0)
+   * Flags1                     (2 bytes - currently 0)
+   * Color transform            (1 byte)
    *
    * Although Adobe TN 5116 mentions Version = 101, all the Adobe files
    * now in circulation seem to use Version = 100, so that's what we write.
@@ -572,28 +572,28 @@ emit_adobe_app14 (j_compress_ptr cinfo)
    * YCbCr, 2 if it's YCCK, 0 otherwise.  Adobe's definition has to do with
    * whether the encoder performed a transformation, which is pretty useless.
    */
-  
+
   emit_marker(cinfo, M_APP14);
-  
+
   emit_2bytes(cinfo, 2 + 5 + 2 + 2 + 2 + 1); /* length */
 
-  emit_byte(cinfo, 0x41);	/* Identifier: ASCII "Adobe" */
+  emit_byte(cinfo, 0x41);       /* Identifier: ASCII "Adobe" */
   emit_byte(cinfo, 0x64);
   emit_byte(cinfo, 0x6F);
   emit_byte(cinfo, 0x62);
   emit_byte(cinfo, 0x65);
-  emit_2bytes(cinfo, 100);	/* Version */
-  emit_2bytes(cinfo, 0);	/* Flags0 */
-  emit_2bytes(cinfo, 0);	/* Flags1 */
+  emit_2bytes(cinfo, 100);      /* Version */
+  emit_2bytes(cinfo, 0);        /* Flags0 */
+  emit_2bytes(cinfo, 0);        /* Flags1 */
   switch (cinfo->jpeg_color_space) {
   case JCS_YCbCr:
-    emit_byte(cinfo, 1);	/* Color transform = 1 */
+    emit_byte(cinfo, 1);        /* Color transform = 1 */
     break;
   case JCS_YCCK:
-    emit_byte(cinfo, 2);	/* Color transform = 2 */
+    emit_byte(cinfo, 2);        /* Color transform = 2 */
     break;
   default:
-    emit_byte(cinfo, 0);	/* Color transform = 0 */
+    emit_byte(cinfo, 0);        /* Color transform = 0 */
     break;
   }
 }
@@ -611,12 +611,12 @@ METHODDEF(void)
 write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
 /* Emit an arbitrary marker header */
 {
-  if (datalen > (unsigned int) 65533)		/* safety check */
+  if (datalen > (unsigned int) 65533)           /* safety check */
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
   emit_marker(cinfo, (JPEG_MARKER) marker);
 
-  emit_2bytes(cinfo, (int) (datalen + 2));	/* total length */
+  emit_2bytes(cinfo, (int) (datalen + 2));      /* total length */
 }
 
 METHODDEF(void)
@@ -643,12 +643,12 @@ write_file_header (j_compress_ptr cinfo)
 {
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
 
-  emit_marker(cinfo, M_SOI);	/* first the SOI */
+  emit_marker(cinfo, M_SOI);    /* first the SOI */
 
   /* SOI is defined to reset restart interval to 0 */
   marker->last_restart_interval = 0;
 
-  if (cinfo->write_JFIF_header)	/* next an optional JFIF APP0 */
+  if (cinfo->write_JFIF_header) /* next an optional JFIF APP0 */
     emit_jfif_app0(cinfo);
   if (cinfo->write_Adobe_marker) /* next an optional Adobe APP14 */
     emit_adobe_app14(cinfo);
@@ -669,7 +669,7 @@ write_frame_header (j_compress_ptr cinfo)
   int ci, prec;
   boolean is_baseline;
   jpeg_component_info *compptr;
-  
+
   /* Emit DQT for each quantization table.
    * Note that emit_dqt() suppresses any duplicate tables.
    */
@@ -692,9 +692,9 @@ write_frame_header (j_compress_ptr cinfo)
   } else {
     is_baseline = TRUE;
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       if (compptr->dc_tbl_no > 1 || compptr->ac_tbl_no > 1)
-	is_baseline = FALSE;
+        is_baseline = FALSE;
     }
     if (prec && is_baseline) {
       is_baseline = FALSE;
@@ -711,11 +711,11 @@ write_frame_header (j_compress_ptr cinfo)
       emit_sof(cinfo, M_SOF9);  /* SOF code for sequential arithmetic */
   } else {
     if (cinfo->progressive_mode)
-      emit_sof(cinfo, M_SOF2);	/* SOF code for progressive Huffman */
+      emit_sof(cinfo, M_SOF2);  /* SOF code for progressive Huffman */
     else if (is_baseline)
-      emit_sof(cinfo, M_SOF0);	/* SOF code for baseline implementation */
+      emit_sof(cinfo, M_SOF0);  /* SOF code for baseline implementation */
     else
-      emit_sof(cinfo, M_SOF1);	/* SOF code for non-baseline Huffman file */
+      emit_sof(cinfo, M_SOF1);  /* SOF code for non-baseline Huffman file */
   }
 }
 
@@ -801,9 +801,9 @@ write_tables_only (j_compress_ptr cinfo)
   if (! cinfo->arith_code) {
     for (i = 0; i < NUM_HUFF_TBLS; i++) {
       if (cinfo->dc_huff_tbl_ptrs[i] != NULL)
-	emit_dht(cinfo, i, FALSE);
+        emit_dht(cinfo, i, FALSE);
       if (cinfo->ac_huff_tbl_ptrs[i] != NULL)
-	emit_dht(cinfo, i, TRUE);
+        emit_dht(cinfo, i, TRUE);
     }
   }
 
diff --git a/jcmaster.c b/jcmaster.c
index 37e577cd..ceb4282c 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -12,7 +12,7 @@
  *
  * This file contains master control logic for the JPEG compressor.
  * These routines are concerned with parameter validation, initial setup,
- * and inter-pass control (determining the number of passes and the work 
+ * and inter-pass control (determining the number of passes and the work
  * to be done in each pass).
  */
 
@@ -26,21 +26,21 @@
 /* Private state */
 
 typedef enum {
-	main_pass,		/* input data, also do first output step */
-	huff_opt_pass,		/* Huffman code optimization pass */
-	output_pass,		/* data output pass */
+        main_pass,              /* input data, also do first output step */
+        huff_opt_pass,          /* Huffman code optimization pass */
+        output_pass,            /* data output pass */
         trellis_pass            /* trellis quantization pass */
 } c_pass_type;
 
 typedef struct {
-  struct jpeg_comp_master pub;	/* public fields */
+  struct jpeg_comp_master pub;  /* public fields */
 
-  c_pass_type pass_type;	/* the type of the current pass */
+  c_pass_type pass_type;        /* the type of the current pass */
 
-  int pass_number;		/* # of passes completed */
-  int total_passes;		/* total # of passes needed */
+  int pass_number;              /* # of passes completed */
+  int total_passes;             /* total # of passes needed */
 
-  int scan_number;		/* current index in scan_info[] */
+  int scan_number;              /* current index in scan_info[] */
   
   /* fields for scan optimisation */
   int pass_number_scan_opt_base; /* pass number where scan optimization begins */
@@ -122,7 +122,7 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
   /* Check that number of components won't exceed internal array sizes */
   if (cinfo->num_components > MAX_COMPONENTS)
     ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
-	     MAX_COMPONENTS);
+             MAX_COMPONENTS);
 
   /* Compute maximum sampling factors; check factor validity */
   cinfo->max_h_samp_factor = 1;
@@ -130,12 +130,12 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-	compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
-				   compptr->h_samp_factor);
+                                   compptr->h_samp_factor);
     cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor,
-				   compptr->v_samp_factor);
+                                   compptr->v_samp_factor);
   }
 
   /* Compute dimensions of components */
@@ -152,17 +152,17 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-		    (long) cinfo->max_h_samp_factor);
+                    (long) cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-		    (long) cinfo->max_v_samp_factor);
+                    (long) cinfo->max_v_samp_factor);
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
   }
@@ -172,7 +172,7 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long) cinfo->_jpeg_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
 }
 
 
@@ -213,15 +213,15 @@ validate_script (j_compress_ptr cinfo)
 #ifdef C_PROGRESSIVE_SUPPORTED
     cinfo->progressive_mode = TRUE;
     last_bitpos_ptr = & last_bitpos[0][0];
-    for (ci = 0; ci < cinfo->num_components; ci++) 
+    for (ci = 0; ci < cinfo->num_components; ci++)
       for (coefi = 0; coefi < DCTSIZE2; coefi++)
-	*last_bitpos_ptr++ = -1;
+        *last_bitpos_ptr++ = -1;
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
     cinfo->progressive_mode = FALSE;
-    for (ci = 0; ci < cinfo->num_components; ci++) 
+    for (ci = 0; ci < cinfo->num_components; ci++)
       component_sent[ci] = FALSE;
   }
 
@@ -233,10 +233,10 @@ validate_script (j_compress_ptr cinfo)
     for (ci = 0; ci < ncomps; ci++) {
       thisi = scanptr->component_index[ci];
       if (thisi < 0 || thisi >= cinfo->num_components)
-	ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+        ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
       /* Components must appear in SOF order within each scan */
       if (ci > 0 && thisi <= scanptr->component_index[ci-1])
-	ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+        ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
     }
     /* Validate progression parameters */
     Ss = scanptr->Ss;
@@ -258,43 +258,43 @@ validate_script (j_compress_ptr cinfo)
 #define MAX_AH_AL 13
 #endif
       if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
-	  Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
-	ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+          Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
+        ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       if (Ss == 0) {
-	if (Se != 0)		/* DC and AC together not OK */
-	  ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        if (Se != 0)            /* DC and AC together not OK */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       } else {
-	if (ncomps != 1)	/* AC scans must be for only one component */
-	  ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        if (ncomps != 1)        /* AC scans must be for only one component */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       }
       for (ci = 0; ci < ncomps; ci++) {
-	last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
-	if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
-	  ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
-	for (coefi = Ss; coefi <= Se; coefi++) {
-	  if (last_bitpos_ptr[coefi] < 0) {
-	    /* first scan of this coefficient */
-	    if (Ah != 0)
-	      ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
-	  } else {
-	    /* not first scan */
-	    if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
-	      ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
-	  }
-	  last_bitpos_ptr[coefi] = Al;
-	}
+        last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
+        if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        for (coefi = Ss; coefi <= Se; coefi++) {
+          if (last_bitpos_ptr[coefi] < 0) {
+            /* first scan of this coefficient */
+            if (Ah != 0)
+              ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+          } else {
+            /* not first scan */
+            if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
+              ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+          }
+          last_bitpos_ptr[coefi] = Al;
+        }
       }
 #endif
     } else {
       /* For sequential JPEG, all progression parameters must be these: */
       if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0)
-	ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       /* Make sure components are not sent twice */
       for (ci = 0; ci < ncomps; ci++) {
-	thisi = scanptr->component_index[ci];
-	if (component_sent[thisi])
-	  ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
-	component_sent[thisi] = TRUE;
+        thisi = scanptr->component_index[ci];
+        if (component_sent[thisi])
+          ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+        component_sent[thisi] = TRUE;
       }
     }
   }
@@ -309,13 +309,13 @@ validate_script (j_compress_ptr cinfo)
      */
     for (ci = 0; ci < cinfo->num_components; ci++) {
       if (last_bitpos[ci][0] < 0)
-	ERREXIT(cinfo, JERR_MISSING_DATA);
+        ERREXIT(cinfo, JERR_MISSING_DATA);
     }
 #endif
   } else {
     for (ci = 0; ci < cinfo->num_components; ci++) {
       if (! component_sent[ci])
-	ERREXIT(cinfo, JERR_MISSING_DATA);
+        ERREXIT(cinfo, JERR_MISSING_DATA);
     }
   }
 }
@@ -356,7 +356,7 @@ select_scan_parameters (j_compress_ptr cinfo)
     cinfo->comps_in_scan = scanptr->comps_in_scan;
     for (ci = 0; ci < scanptr->comps_in_scan; ci++) {
       cinfo->cur_comp_info[ci] =
-	&cinfo->comp_info[scanptr->component_index[ci]];
+        &cinfo->comp_info[scanptr->component_index[ci]];
     }
     cinfo->Ss = scanptr->Ss;
     cinfo->Se = scanptr->Se;
@@ -384,7 +384,7 @@ select_scan_parameters (j_compress_ptr cinfo)
     /* Prepare for single sequential-JPEG scan containing all components */
     if (cinfo->num_components > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
-	       MAX_COMPS_IN_SCAN);
+               MAX_COMPS_IN_SCAN);
     cinfo->comps_in_scan = cinfo->num_components;
     for (ci = 0; ci < cinfo->num_components; ci++) {
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
@@ -404,16 +404,16 @@ per_scan_setup (j_compress_ptr cinfo)
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
-  
+
   if (cinfo->comps_in_scan == 1) {
-    
+
     /* Noninterleaved (single-component) scan */
     compptr = cinfo->cur_comp_info[0];
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = compptr->width_in_blocks;
     cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
-    
+
     /* For noninterleaved scan, always one block per MCU */
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
@@ -426,28 +426,28 @@ per_scan_setup (j_compress_ptr cinfo)
     tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
-    
+
     /* Prepare array describing MCU composition */
     cinfo->blocks_in_MCU = 1;
     cinfo->MCU_membership[0] = 0;
-    
+
   } else {
-    
+
     /* Interleaved (multi-component) scan */
     if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
-	       MAX_COMPS_IN_SCAN);
-    
+               MAX_COMPS_IN_SCAN);
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
-    
+                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+
     cinfo->blocks_in_MCU = 0;
-    
+
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       compptr = cinfo->cur_comp_info[ci];
       /* Sampling factors give # of blocks of component in each MCU */
@@ -465,12 +465,12 @@ per_scan_setup (j_compress_ptr cinfo)
       /* Prepare array describing MCU composition */
       mcublks = compptr->MCU_blocks;
       if (cinfo->blocks_in_MCU + mcublks > C_MAX_BLOCKS_IN_MCU)
-	ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
+        ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
       while (mcublks-- > 0) {
-	cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
+        cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
       }
     }
-    
+
   }
 
   /* Convert restart specified in rows to actual MCU count. */
@@ -512,8 +512,8 @@ prepare_for_pass (j_compress_ptr cinfo)
     (*cinfo->fdct->start_pass) (cinfo);
     (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding);
     (*cinfo->coef->start_pass) (cinfo,
-				(master->total_passes > 1 ?
-				 JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
+                                (master->total_passes > 1 ?
+                                 JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
     (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU);
     if (cinfo->optimize_coding) {
       /* No immediate data output; postpone writing frame/scan headers */
@@ -946,7 +946,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
     cinfo->num_scans = 1;
   }
 
-  if (cinfo->progressive_mode && !cinfo->arith_code)	/*  TEMPORARY HACK ??? */
+  if (cinfo->progressive_mode && !cinfo->arith_code)    /*  TEMPORARY HACK ??? */
     cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
 
   /* Initialize my private state */
diff --git a/jconfig.txt b/jconfig.txt
index 4946c981..d8a45f53 100644
--- a/jconfig.txt
+++ b/jconfig.txt
@@ -78,10 +78,10 @@
 /* Define "boolean" as unsigned char, not int, on Windows systems.
  */
 #ifdef _WIN32
-#ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
+#ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
 typedef unsigned char boolean;
 #endif
-#define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
+#define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
 #endif
 
 
@@ -114,12 +114,12 @@ typedef unsigned char boolean;
 
 /* These defines indicate which image (non-JPEG) file formats are allowed. */
 
-#define PNG_SUPPORTED       /* PNG image file format */
-#define BMP_SUPPORTED       /* BMP image file format */
-#define GIF_SUPPORTED       /* GIF image file format */
-#define PPM_SUPPORTED       /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED        /* Utah RLE image file format */
-#define TARGA_SUPPORTED     /* Targa image file format */
+#define PNG_SUPPORTED           /* PNG image file format */
+#define BMP_SUPPORTED           /* BMP image file format */
+#define GIF_SUPPORTED           /* GIF image file format */
+#define PPM_SUPPORTED           /* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED            /* Utah RLE image file format */
+#define TARGA_SUPPORTED         /* Targa image file format */
 
 /* Define this if you want to name both input and output files on the command
  * line, rather than using stdout and optionally stdin.  You MUST do this if
diff --git a/jcparam.c b/jcparam.c
index 8af8ae64..77407651 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -117,11 +117,11 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
     jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
                          cinfo->q_scale_factor[1], force_baseline);
   } else {
-  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-                       cinfo->q_scale_factor[0], force_baseline);
-  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-                       cinfo->q_scale_factor[1], force_baseline);
-}
+    jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+                         cinfo->q_scale_factor[0], force_baseline);
+    jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+                         cinfo->q_scale_factor[1], force_baseline);
+  }
 }
 #endif
 
@@ -142,11 +142,11 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
     jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
                          scale_factor, force_baseline);
   } else {
-  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-                       scale_factor, force_baseline);
-  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-                       scale_factor, force_baseline);
-}
+    jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+                         scale_factor, force_baseline);
+    jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+                         scale_factor, force_baseline);
+  }
 }
 
 
@@ -249,9 +249,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->scan_info = NULL;
   cinfo->num_scans = 0;
   if (!cinfo->master->use_moz_defaults) {
-  /* Default is no multiple-scan output */
-  cinfo->scan_info = NULL;
-  cinfo->num_scans = 0;
+    /* Default is no multiple-scan output */
+    cinfo->scan_info = NULL;
+    cinfo->num_scans = 0;
   }
 #else
   /* Default is no multiple-scan output */
@@ -270,8 +270,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
     /* By default, do extra passes to optimize entropy coding */
     cinfo->optimize_coding = TRUE;
   else
-  /* By default, don't do extra passes to optimize entropy coding */
-  cinfo->optimize_coding = FALSE;
+    /* By default, don't do extra passes to optimize entropy coding */
+    cinfo->optimize_coding = FALSE;
 #else
   /* By default, don't do extra passes to optimize entropy coding */
   cinfo->optimize_coding = FALSE;
@@ -494,7 +494,7 @@ fill_a_scan (jpeg_scan_info * scanptr, int ci,
 
 LOCAL(jpeg_scan_info *)
 fill_a_scan_pair (jpeg_scan_info * scanptr, int ci,
-             int Ss, int Se, int Ah, int Al)
+                  int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for pair of components */
 {
   scanptr->comps_in_scan = 2;
@@ -705,16 +705,16 @@ jpeg_simple_progression (j_compress_ptr cinfo)
   } else {
     /* All-purpose script for other color spaces. */
     if (cinfo->master->use_moz_defaults == TRUE) {
-    if (ncomps > MAX_COMPS_IN_SCAN)
-        nscans = 5 * ncomps;	/* 2 DC + 4 AC scans per component */
+      if (ncomps > MAX_COMPS_IN_SCAN)
+        nscans = 5 * ncomps;      /* 2 DC + 4 AC scans per component */
       else
-        nscans = 1 + 4 * ncomps;	/* 2 DC scans; 4 AC scans per component */
+        nscans = 1 + 4 * ncomps;  /* 2 DC scans; 4 AC scans per component */
     } else {
       if (ncomps > MAX_COMPS_IN_SCAN)
-      nscans = 6 * ncomps;      /* 2 DC + 4 AC scans per component */
-    else
-      nscans = 2 + 4 * ncomps;  /* 2 DC scans; 4 AC scans per component */
-  }
+        nscans = 6 * ncomps;      /* 2 DC + 4 AC scans per component */
+      else
+        nscans = 2 + 4 * ncomps;  /* 2 DC scans; 4 AC scans per component */
+    }
   }
 
   /* Allocate space for script.
@@ -764,23 +764,23 @@ jpeg_simple_progression (j_compress_ptr cinfo)
       scanptr = fill_a_scan(scanptr, 2, 9, 63, 0, 0);
     } else {
       /* Initial DC scan */
-    scanptr = fill_dc_scans(scanptr, ncomps, 0, 1);
-    /* Initial AC scan: get some luma data out in a hurry */
-    scanptr = fill_a_scan(scanptr, 0, 1, 5, 0, 2);
-    /* Chroma data is too small to be worth expending many scans on */
-    scanptr = fill_a_scan(scanptr, 2, 1, 63, 0, 1);
-    scanptr = fill_a_scan(scanptr, 1, 1, 63, 0, 1);
-    /* Complete spectral selection for luma AC */
-    scanptr = fill_a_scan(scanptr, 0, 6, 63, 0, 2);
-    /* Refine next bit of luma AC */
-    scanptr = fill_a_scan(scanptr, 0, 1, 63, 2, 1);
-    /* Finish DC successive approximation */
-    scanptr = fill_dc_scans(scanptr, ncomps, 1, 0);
-    /* Finish AC successive approximation */
-    scanptr = fill_a_scan(scanptr, 2, 1, 63, 1, 0);
-    scanptr = fill_a_scan(scanptr, 1, 1, 63, 1, 0);
-    /* Luma bottom bit comes last since it's usually largest scan */
-    scanptr = fill_a_scan(scanptr, 0, 1, 63, 1, 0);
+      scanptr = fill_dc_scans(scanptr, ncomps, 0, 1);
+      /* Initial AC scan: get some luma data out in a hurry */
+      scanptr = fill_a_scan(scanptr, 0, 1, 5, 0, 2);
+      /* Chroma data is too small to be worth expending many scans on */
+      scanptr = fill_a_scan(scanptr, 2, 1, 63, 0, 1);
+      scanptr = fill_a_scan(scanptr, 1, 1, 63, 0, 1);
+      /* Complete spectral selection for luma AC */
+      scanptr = fill_a_scan(scanptr, 0, 6, 63, 0, 2);
+      /* Refine next bit of luma AC */
+      scanptr = fill_a_scan(scanptr, 0, 1, 63, 2, 1);
+      /* Finish DC successive approximation */
+      scanptr = fill_dc_scans(scanptr, ncomps, 1, 0);
+      /* Finish AC successive approximation */
+      scanptr = fill_a_scan(scanptr, 2, 1, 63, 1, 0);
+      scanptr = fill_a_scan(scanptr, 1, 1, 63, 1, 0);
+      /* Luma bottom bit comes last since it's usually largest scan */
+      scanptr = fill_a_scan(scanptr, 0, 1, 63, 1, 0);
     }
   } else {
     /* All-purpose script for other color spaces. */
@@ -788,7 +788,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
       /* scan defined in jpeg_scan_bw.txt in jpgcrush */
       /* DC component, no successive approximation */
       scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
-    /* Successive approximation first pass */
+      /* Successive approximation first pass */
       scanptr = fill_scans(scanptr, ncomps, 1, 8, 0, 2);
       scanptr = fill_scans(scanptr, ncomps, 9, 63, 0, 2);
       /* Successive approximation second pass */
@@ -797,17 +797,16 @@ jpeg_simple_progression (j_compress_ptr cinfo)
       scanptr = fill_scans(scanptr, ncomps, 1, 63, 1, 0);
     } else {
       /* Successive approximation first pass */
-    scanptr = fill_dc_scans(scanptr, ncomps, 0, 1);
-    scanptr = fill_scans(scanptr, ncomps, 1, 5, 0, 2);
-    scanptr = fill_scans(scanptr, ncomps, 6, 63, 0, 2);
-    /* Successive approximation second pass */
-    scanptr = fill_scans(scanptr, ncomps, 1, 63, 2, 1);
-    /* Successive approximation final pass */
-    scanptr = fill_dc_scans(scanptr, ncomps, 1, 0);
-    scanptr = fill_scans(scanptr, ncomps, 1, 63, 1, 0);
+      scanptr = fill_dc_scans(scanptr, ncomps, 0, 1);
+      scanptr = fill_scans(scanptr, ncomps, 1, 5, 0, 2);
+      scanptr = fill_scans(scanptr, ncomps, 6, 63, 0, 2);
+      /* Successive approximation second pass */
+      scanptr = fill_scans(scanptr, ncomps, 1, 63, 2, 1);
+      /* Successive approximation final pass */
+      scanptr = fill_dc_scans(scanptr, ncomps, 1, 0);
+      scanptr = fill_scans(scanptr, ncomps, 1, 63, 1, 0);
+    }
   }
 }
-}
-
 
 #endif /* C_PROGRESSIVE_SUPPORTED */
diff --git a/jcphuff.c b/jcphuff.c
index da2464ad..cebd075e 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -177,7 +177,7 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
         int i, j;
         for (i = 0; i < 16; i++)
           for (j = 0; j < 12; j++)
-          entropy->count_ptrs[tbl][16*i+j] = 1;
+            entropy->count_ptrs[tbl][16 * i + j] = 1;
       }
     } else {
       /* Compute derived values for Huffman table */
diff --git a/jpegint.h b/jpegint.h
index bc0d3dbe..76fef797 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -23,9 +23,8 @@ typedef enum {            /* Operating modes for buffer controllers */
   /* Remaining modes require a full-image buffer to have been created */
   JBUF_SAVE_SOURCE,       /* Run source subobject only, save output */
   JBUF_CRANK_DEST,        /* Run dest subobject only, using saved data */
-	JBUF_SAVE_AND_PASS,	/* Run both subobjects, save output */
-        JBUF_REQUANT            /* Requantize */
-
+  JBUF_SAVE_AND_PASS,     /* Run both subobjects, save output */
+  JBUF_REQUANT            /* Requantize */
 } J_BUF_MODE;
 
 /* Values of global_state field (jdapi.c has some dependencies on ordering!) */
diff --git a/jpeglib.h b/jpeglib.h
index 2f000723..7ea8baea 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -25,10 +25,10 @@
  * manual configuration options that most people need not worry about.
  */
 
-#ifndef JCONFIG_INCLUDED	/* in case jinclude.h already did */
-#include "jconfig.h"		/* widely used configuration options */
+#ifndef JCONFIG_INCLUDED        /* in case jinclude.h already did */
+#include "jconfig.h"            /* widely used configuration options */
 #endif
-#include "jmorecfg.h"		/* seldom changed options */
+#include "jmorecfg.h"           /* seldom changed options */
 
 
 #ifdef __cplusplus
@@ -43,13 +43,13 @@ extern "C" {
  * if you want to be compatible.
  */
 
-#define DCTSIZE		    8	/* The basic DCT block is 8x8 samples */
-#define DCTSIZE2	    64	/* DCTSIZE squared; # of elements in a block */
-#define NUM_QUANT_TBLS      4	/* Quantization tables are numbered 0..3 */
-#define NUM_HUFF_TBLS       4	/* Huffman tables are numbered 0..3 */
-#define NUM_ARITH_TBLS      16	/* Arith-coding tables are numbered 0..15 */
-#define MAX_COMPS_IN_SCAN   4	/* JPEG limit on # of components in one scan */
-#define MAX_SAMP_FACTOR     4	/* JPEG limit on sampling factors */
+#define DCTSIZE             8   /* The basic DCT block is 8x8 samples */
+#define DCTSIZE2            64  /* DCTSIZE squared; # of elements in a block */
+#define NUM_QUANT_TBLS      4   /* Quantization tables are numbered 0..3 */
+#define NUM_HUFF_TBLS       4   /* Huffman tables are numbered 0..3 */
+#define NUM_ARITH_TBLS      16  /* Arith-coding tables are numbered 0..15 */
+#define MAX_COMPS_IN_SCAN   4   /* JPEG limit on # of components in one scan */
+#define MAX_SAMP_FACTOR     4   /* JPEG limit on sampling factors */
 /* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
  * the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
  * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU
@@ -67,13 +67,13 @@ extern "C" {
  */
 
 typedef JSAMPLE *JSAMPROW;      /* ptr to one image row of pixel samples. */
-typedef JSAMPROW *JSAMPARRAY;	/* ptr to some rows (a 2-D sample array) */
-typedef JSAMPARRAY *JSAMPIMAGE;	/* a 3-D sample array: top index is color */
+typedef JSAMPROW *JSAMPARRAY;   /* ptr to some rows (a 2-D sample array) */
+typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
 
-typedef JCOEF JBLOCK[DCTSIZE2];	/* one block of coefficients */
+typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */
 typedef JBLOCK *JBLOCKROW;      /* pointer to one row of coefficient blocks */
-typedef JBLOCKROW *JBLOCKARRAY;		/* a 2-D array of coefficient blocks */
-typedef JBLOCKARRAY *JBLOCKIMAGE;	/* a 3-D array of coefficient blocks */
+typedef JBLOCKROW *JBLOCKARRAY;         /* a 2-D array of coefficient blocks */
+typedef JBLOCKARRAY *JBLOCKIMAGE;       /* a 3-D array of coefficient blocks */
 
 typedef JCOEF *JCOEFPTR;        /* useful in a couple of places */
 
@@ -88,13 +88,13 @@ typedef struct {
    * (not the zigzag order in which they are stored in a JPEG DQT marker).
    * CAUTION: IJG versions prior to v6a kept this array in zigzag order.
    */
-  UINT16 quantval[DCTSIZE2];	/* quantization step for each coefficient */
+  UINT16 quantval[DCTSIZE2];    /* quantization step for each coefficient */
   /* This field is used only during compression.  It's initialized FALSE when
    * the table is created, and set TRUE when it's been output to the file.
    * You could suppress output of a table by setting this to TRUE.
    * (See jpeg_suppress_tables for an example.)
    */
-  boolean sent_table;		/* TRUE when table has been output */
+  boolean sent_table;           /* TRUE when table has been output */
 } JQUANT_TBL;
 
 
@@ -102,15 +102,15 @@ typedef struct {
 
 typedef struct {
   /* These two fields directly represent the contents of a JPEG DHT marker */
-  UINT8 bits[17];		/* bits[k] = # of symbols with codes of */
-				/* length k bits; bits[0] is unused */
-  UINT8 huffval[256];		/* The symbols, in order of incr code length */
+  UINT8 bits[17];               /* bits[k] = # of symbols with codes of */
+                                /* length k bits; bits[0] is unused */
+  UINT8 huffval[256];           /* The symbols, in order of incr code length */
   /* This field is used only during compression.  It's initialized FALSE when
    * the table is created, and set TRUE when it's been output to the file.
    * You could suppress output of a table by setting this to TRUE.
    * (See jpeg_suppress_tables for an example.)
    */
-  boolean sent_table;		/* TRUE when table has been output */
+  boolean sent_table;           /* TRUE when table has been output */
 } JHUFF_TBL;
 
 
@@ -120,20 +120,20 @@ typedef struct {
   /* These values are fixed over the whole image. */
   /* For compression, they must be supplied by parameter setup; */
   /* for decompression, they are read from the SOF marker. */
-  int component_id;		/* identifier for this component (0..255) */
-  int component_index;		/* its index in SOF or cinfo->comp_info[] */
-  int h_samp_factor;		/* horizontal sampling factor (1..4) */
-  int v_samp_factor;		/* vertical sampling factor (1..4) */
-  int quant_tbl_no;		/* quantization table selector (0..3) */
+  int component_id;             /* identifier for this component (0..255) */
+  int component_index;          /* its index in SOF or cinfo->comp_info[] */
+  int h_samp_factor;            /* horizontal sampling factor (1..4) */
+  int v_samp_factor;            /* vertical sampling factor (1..4) */
+  int quant_tbl_no;             /* quantization table selector (0..3) */
   /* These values may vary between scans. */
   /* For compression, they must be supplied by parameter setup; */
   /* for decompression, they are read from the SOS marker. */
   /* The decompressor output side may not use these variables. */
-  int dc_tbl_no;		/* DC entropy table selector (0..3) */
-  int ac_tbl_no;		/* AC entropy table selector (0..3) */
-  
+  int dc_tbl_no;                /* DC entropy table selector (0..3) */
+  int ac_tbl_no;                /* AC entropy table selector (0..3) */
+
   /* Remaining fields should be treated as private by applications. */
-  
+
   /* These values are computed during compression or decompression startup: */
   /* Component's size in DCT blocks.
    * Any dummy blocks added to complete an MCU are not counted; therefore
@@ -159,22 +159,22 @@ typedef struct {
    * and similarly for height.  For decompression, IDCT scaling is included, so
    * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
    */
-  JDIMENSION downsampled_width;	 /* actual width in samples */
+  JDIMENSION downsampled_width;  /* actual width in samples */
   JDIMENSION downsampled_height; /* actual height in samples */
   /* This flag is used only for decompression.  In cases where some of the
    * components will be ignored (eg grayscale output from YCbCr image),
    * we can skip most computations for the unused components.
    */
-  boolean component_needed;	/* do we need the value of this component? */
+  boolean component_needed;     /* do we need the value of this component? */
 
   /* These values are computed before starting a scan of the component. */
   /* The decompressor output side may not use these variables. */
-  int MCU_width;		/* number of blocks per MCU, horizontally */
-  int MCU_height;		/* number of blocks per MCU, vertically */
-  int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
-  int last_col_width;		/* # of non-dummy blocks across in last MCU */
-  int last_row_height;		/* # of non-dummy blocks down in last MCU */
+  int MCU_width;                /* number of blocks per MCU, horizontally */
+  int MCU_height;               /* number of blocks per MCU, vertically */
+  int MCU_blocks;               /* MCU_width * MCU_height */
+  int MCU_sample_width;         /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
+  int last_col_width;           /* # of non-dummy blocks across in last MCU */
+  int last_row_height;          /* # of non-dummy blocks down in last MCU */
 
   /* Saved quantization table for component; NULL if none yet saved.
    * See jdinput.c comments about the need for this information.
@@ -190,10 +190,10 @@ typedef struct {
 /* The script for encoding a multiple-scan file is an array of these: */
 
 typedef struct {
-  int comps_in_scan;		/* number of components encoded in this scan */
+  int comps_in_scan;            /* number of components encoded in this scan */
   int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */
-  int Ss, Se;			/* progressive JPEG spectral selection parms */
-  int Ah, Al;			/* progressive JPEG successive approx. parms */
+  int Ss, Se;                   /* progressive JPEG spectral selection parms */
+  int Ah, Al;                   /* progressive JPEG successive approx. parms */
 } jpeg_scan_info;
 
 /* The decompressor can save APPn and COM markers in a list of these: */
@@ -201,10 +201,10 @@ typedef struct {
 typedef struct jpeg_marker_struct * jpeg_saved_marker_ptr;
 
 struct jpeg_marker_struct {
-  jpeg_saved_marker_ptr next;	/* next in list, or NULL */
-  UINT8 marker;			/* marker code: JPEG_COM, or JPEG_APP0+n */
-  unsigned int original_length;	/* # bytes of data in the file */
-  unsigned int data_length;	/* # bytes of data saved at data[] */
+  jpeg_saved_marker_ptr next;   /* next in list, or NULL */
+  UINT8 marker;                 /* marker code: JPEG_COM, or JPEG_APP0+n */
+  unsigned int original_length; /* # bytes of data in the file */
+  unsigned int data_length;     /* # bytes of data saved at data[] */
   JOCTET * data;                /* the data contained in the marker */
   /* the marker length word is not counted in data_length or original_length */
 };
@@ -215,28 +215,28 @@ struct jpeg_marker_struct {
 #define JCS_ALPHA_EXTENSIONS 1
 
 typedef enum {
-	JCS_UNKNOWN,		/* error/unspecified */
-	JCS_GRAYSCALE,		/* monochrome */
+  JCS_UNKNOWN,            /* error/unspecified */
+  JCS_GRAYSCALE,          /* monochrome */
   JCS_RGB,                /* red/green/blue as specified by the RGB_RED,
                              RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros */
-	JCS_YCbCr,		/* Y/Cb/Cr (also known as YUV) */
-	JCS_CMYK,		/* C/M/Y/K */
-	JCS_YCCK,		/* Y/Cb/Cr/K */
-	JCS_EXT_RGB,		/* red/green/blue */
-	JCS_EXT_RGBX,		/* red/green/blue/x */
-	JCS_EXT_BGR,		/* blue/green/red */
-	JCS_EXT_BGRX,		/* blue/green/red/x */
-	JCS_EXT_XBGR,		/* x/blue/green/red */
-	JCS_EXT_XRGB,		/* x/red/green/blue */
+  JCS_YCbCr,              /* Y/Cb/Cr (also known as YUV) */
+  JCS_CMYK,               /* C/M/Y/K */
+  JCS_YCCK,               /* Y/Cb/Cr/K */
+  JCS_EXT_RGB,            /* red/green/blue */
+  JCS_EXT_RGBX,           /* red/green/blue/x */
+  JCS_EXT_BGR,            /* blue/green/red */
+  JCS_EXT_BGRX,           /* blue/green/red/x */
+  JCS_EXT_XBGR,           /* x/blue/green/red */
+  JCS_EXT_XRGB,           /* x/red/green/blue */
   /* When out_color_space it set to JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
      or JCS_EXT_XRGB during decompression, the X byte is undefined, and in
      order to ensure the best performance, libjpeg-turbo can set that byte to
      whatever value it wishes.  Use the following colorspace constants to
      ensure that the X byte is set to 0xFF, so that it can be interpreted as an
      opaque alpha channel. */
-	JCS_EXT_RGBA,		/* red/green/blue/alpha */
-	JCS_EXT_BGRA,		/* blue/green/red/alpha */
-	JCS_EXT_ABGR,		/* alpha/blue/green/red */
+  JCS_EXT_RGBA,           /* red/green/blue/alpha */
+  JCS_EXT_BGRA,           /* blue/green/red/alpha */
+  JCS_EXT_ABGR,           /* alpha/blue/green/red */
   JCS_EXT_ARGB,           /* alpha/red/green/blue */
   JCS_RGB565              /* 5-bit red/6-bit green/5-bit blue */
 } J_COLOR_SPACE;
@@ -244,24 +244,24 @@ typedef enum {
 /* DCT/IDCT algorithm options. */
 
 typedef enum {
-	JDCT_ISLOW,		/* slow but accurate integer algorithm */
-	JDCT_IFAST,		/* faster, less accurate integer method */
-	JDCT_FLOAT		/* floating-point: accurate, fast on fast HW */
+  JDCT_ISLOW,             /* slow but accurate integer algorithm */
+  JDCT_IFAST,             /* faster, less accurate integer method */
+  JDCT_FLOAT              /* floating-point: accurate, fast on fast HW */
 } J_DCT_METHOD;
 
-#ifndef JDCT_DEFAULT		/* may be overridden in jconfig.h */
+#ifndef JDCT_DEFAULT            /* may be overridden in jconfig.h */
 #define JDCT_DEFAULT  JDCT_ISLOW
 #endif
-#ifndef JDCT_FASTEST		/* may be overridden in jconfig.h */
+#ifndef JDCT_FASTEST            /* may be overridden in jconfig.h */
 #define JDCT_FASTEST  JDCT_IFAST
 #endif
 
 /* Dithering options for decompression. */
 
 typedef enum {
-	JDITHER_NONE,		/* no dithering */
-	JDITHER_ORDERED,	/* simple ordered dither */
-	JDITHER_FS		/* Floyd-Steinberg error diffusion dither */
+  JDITHER_NONE,           /* no dithering */
+  JDITHER_ORDERED,        /* simple ordered dither */
+  JDITHER_FS              /* Floyd-Steinberg error diffusion dither */
 } J_DITHER_MODE;
 
 
@@ -307,19 +307,19 @@ typedef enum {
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
-  struct jpeg_error_mgr * err;	/* Error handler module */\
-  struct jpeg_memory_mgr * mem;	/* Memory manager module */\
+  struct jpeg_error_mgr * err;  /* Error handler module */\
+  struct jpeg_memory_mgr * mem; /* Memory manager module */\
   struct jpeg_progress_mgr * progress; /* Progress monitor, or NULL if none */\
-  void * client_data;		/* Available for use by application */\
-  boolean is_decompressor;	/* So common code can tell which is which */\
-  int global_state		/* For checking call sequence validity */
+  void * client_data;           /* Available for use by application */\
+  boolean is_decompressor;      /* So common code can tell which is which */\
+  int global_state              /* For checking call sequence validity */
 
 /* Routines that are to be used by both halves of the library are declared
  * to receive a pointer to this structure.  There are no actual instances of
  * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct.
  */
 struct jpeg_common_struct {
-  jpeg_common_fields;		/* Fields common to both master struct types */
+  jpeg_common_fields;           /* Fields common to both master struct types */
   /* Additional fields follow in an actual jpeg_compress_struct or
    * jpeg_decompress_struct.  All three structs must agree on these
    * initial fields!  (This would be a lot cleaner in C++.)
@@ -334,7 +334,7 @@ typedef struct jpeg_decompress_struct * j_decompress_ptr;
 /* Master record for a compression instance */
 
 struct jpeg_compress_struct {
-  jpeg_common_fields;		/* Fields shared with jpeg_decompress_struct */
+  jpeg_common_fields;           /* Fields shared with jpeg_decompress_struct */
 
   /* Destination for compressed data */
   struct jpeg_destination_mgr * dest;
@@ -344,12 +344,12 @@ struct jpeg_compress_struct {
    * be correct before you can even call jpeg_set_defaults().
    */
 
-  JDIMENSION image_width;	/* input image width */
-  JDIMENSION image_height;	/* input image height */
-  int input_components;		/* # of color components in input image */
-  J_COLOR_SPACE in_color_space;	/* colorspace of input image */
+  JDIMENSION image_width;       /* input image width */
+  JDIMENSION image_height;      /* input image height */
+  int input_components;         /* # of color components in input image */
+  J_COLOR_SPACE in_color_space; /* colorspace of input image */
 
-  double input_gamma;		/* image gamma of input image */
+  double input_gamma;           /* image gamma of input image */
 
   /* Compression parameters --- these fields must be set before calling
    * jpeg_start_compress().  We recommend calling jpeg_set_defaults() to
@@ -362,8 +362,8 @@ struct jpeg_compress_struct {
 #if JPEG_LIB_VERSION >= 70
   unsigned int scale_num, scale_denom; /* fraction by which to scale image */
 
-  JDIMENSION jpeg_width;	/* scaled JPEG image width */
-  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  JDIMENSION jpeg_width;        /* scaled JPEG image width */
+  JDIMENSION jpeg_height;       /* scaled JPEG image height */
   /* Dimensions of actual JPEG image that will be written to file,
    * derived from input dimensions by scaling factors above.
    * These fields are computed by jpeg_start_compress().
@@ -372,9 +372,9 @@ struct jpeg_compress_struct {
    */
 #endif
 
-  int data_precision;		/* bits of precision in image data */
+  int data_precision;           /* bits of precision in image data */
 
-  int num_components;		/* # of color components in JPEG image */
+  int num_components;           /* # of color components in JPEG image */
   J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
 
   jpeg_component_info * comp_info;
@@ -396,22 +396,22 @@ struct jpeg_compress_struct {
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
 
-  int num_scans;		/* # of entries in scan_info array */
+  int num_scans;                /* # of entries in scan_info array */
   const jpeg_scan_info * scan_info; /* script for multi-scan file, or NULL */
   /* The default value of scan_info is NULL, which causes a single-scan
    * sequential JPEG file to be emitted.  To create a multi-scan file,
    * set num_scans and scan_info to point to an array of scan definitions.
    */
 
-  boolean raw_data_in;		/* TRUE=caller supplies downsampled data */
-  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
-  boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
-  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean raw_data_in;          /* TRUE=caller supplies downsampled data */
+  boolean arith_code;           /* TRUE=arithmetic coding, FALSE=Huffman */
+  boolean optimize_coding;      /* TRUE=optimize entropy encoding parms */
+  boolean CCIR601_sampling;     /* TRUE=first samples are cosited */
 #if JPEG_LIB_VERSION >= 70
   boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
 #endif
-  int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
-  J_DCT_METHOD dct_method;	/* DCT algorithm selector */
+  int smoothing_factor;         /* 1..100, or 0 for no input smoothing */
+  J_DCT_METHOD dct_method;      /* DCT algorithm selector */
 
   /* The restart interval can be specified in absolute MCUs by setting
    * restart_interval, or in MCU rows by setting restart_in_rows
@@ -419,28 +419,28 @@ struct jpeg_compress_struct {
    * for each scan).
    */
   unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */
-  int restart_in_rows;		/* if > 0, MCU rows per restart interval */
+  int restart_in_rows;          /* if > 0, MCU rows per restart interval */
 
   /* Parameters controlling emission of special markers. */
 
-  boolean write_JFIF_header;	/* should a JFIF marker be written? */
-  UINT8 JFIF_major_version;	/* What to write for the JFIF version number */
+  boolean write_JFIF_header;    /* should a JFIF marker be written? */
+  UINT8 JFIF_major_version;     /* What to write for the JFIF version number */
   UINT8 JFIF_minor_version;
   /* These three values are not used by the JPEG code, merely copied */
   /* into the JFIF APP0 marker.  density_unit can be 0 for unknown, */
   /* 1 for dots/inch, or 2 for dots/cm.  Note that the pixel aspect */
   /* ratio is defined by X_density/Y_density even when density_unit=0. */
-  UINT8 density_unit;		/* JFIF code for pixel size units */
-  UINT16 X_density;		/* Horizontal pixel density */
-  UINT16 Y_density;		/* Vertical pixel density */
-  boolean write_Adobe_marker;	/* should an Adobe marker be written? */
-  
+  UINT8 density_unit;           /* JFIF code for pixel size units */
+  UINT16 X_density;             /* Horizontal pixel density */
+  UINT16 Y_density;             /* Vertical pixel density */
+  boolean write_Adobe_marker;   /* should an Adobe marker be written? */
+
   /* State variable: index of next scanline to be written to
    * jpeg_write_scanlines().  Application may use this to control its
    * processing loop, e.g., "while (next_scanline < image_height)".
    */
 
-  JDIMENSION next_scanline;	/* 0 .. image_height-1  */
+  JDIMENSION next_scanline;     /* 0 .. image_height-1  */
 
   /* Remaining fields are known throughout compressor, but generally
    * should not be touched by a surrounding application.
@@ -449,44 +449,44 @@ struct jpeg_compress_struct {
   /*
    * These fields are computed during compression startup
    */
-  boolean progressive_mode;	/* TRUE if scan script uses progressive mode */
-  int max_h_samp_factor;	/* largest h_samp_factor */
-  int max_v_samp_factor;	/* largest v_samp_factor */
+  boolean progressive_mode;     /* TRUE if scan script uses progressive mode */
+  int max_h_samp_factor;        /* largest h_samp_factor */
+  int max_v_samp_factor;        /* largest v_samp_factor */
 
 #if JPEG_LIB_VERSION >= 70
-  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
-  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+  int min_DCT_h_scaled_size;    /* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
 #endif
 
-  JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows to be input to coef ctlr */
   /* The coefficient controller receives data in units of MCU rows as defined
    * for fully interleaved scans (whether the JPEG file is interleaved or not).
    * There are v_samp_factor * DCTSIZE sample rows of each component in an
    * "iMCU" (interleaved MCU) row.
    */
-  
+
   /*
    * These fields are valid during any one scan.
    * They describe the components and MCUs actually appearing in the scan.
    */
-  int comps_in_scan;		/* # of JPEG components in this scan */
+  int comps_in_scan;            /* # of JPEG components in this scan */
   jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
-  
-  JDIMENSION MCUs_per_row;	/* # of MCUs across the image */
-  JDIMENSION MCU_rows_in_scan;	/* # of MCU rows in the image */
-  
-  int blocks_in_MCU;		/* # of DCT blocks per MCU */
+
+  JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
+  JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
+
+  int blocks_in_MCU;            /* # of DCT blocks per MCU */
   int MCU_membership[C_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
   /* i'th block in an MCU */
 
-  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
+  int Ss, Se, Ah, Al;           /* progressive JPEG parameters for scan */
 
 #if JPEG_LIB_VERSION >= 80
-  int block_size;		/* the basic DCT block size: 1..16 */
-  const int * natural_order;	/* natural-order position array */
-  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+  int block_size;               /* the basic DCT block size: 1..16 */
+  const int * natural_order;    /* natural-order position array */
+  int lim_Se;                   /* min( Se, DCTSIZE2-1 ) */
 #endif
 
   /*
@@ -509,7 +509,7 @@ struct jpeg_compress_struct {
 /* Master record for a decompression instance */
 
 struct jpeg_decompress_struct {
-  jpeg_common_fields;		/* Fields shared with jpeg_compress_struct */
+  jpeg_common_fields;           /* Fields shared with jpeg_compress_struct */
 
   /* Source of compressed data */
   struct jpeg_source_mgr * src;
@@ -517,9 +517,9 @@ struct jpeg_decompress_struct {
   /* Basic description of image --- filled in by jpeg_read_header(). */
   /* Application may inspect these values to decide how to process image. */
 
-  JDIMENSION image_width;	/* nominal image width (from SOF marker) */
-  JDIMENSION image_height;	/* nominal image height */
-  int num_components;		/* # of color components in JPEG image */
+  JDIMENSION image_width;       /* nominal image width (from SOF marker) */
+  JDIMENSION image_height;      /* nominal image height */
+  int num_components;           /* # of color components in JPEG image */
   J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
 
   /* Decompression processing parameters --- these fields must be set before
@@ -531,24 +531,24 @@ struct jpeg_decompress_struct {
 
   unsigned int scale_num, scale_denom; /* fraction by which to scale image */
 
-  double output_gamma;		/* image gamma wanted in output */
+  double output_gamma;          /* image gamma wanted in output */
 
-  boolean buffered_image;	/* TRUE=multiple output passes */
-  boolean raw_data_out;		/* TRUE=downsampled data wanted */
+  boolean buffered_image;       /* TRUE=multiple output passes */
+  boolean raw_data_out;         /* TRUE=downsampled data wanted */
 
-  J_DCT_METHOD dct_method;	/* IDCT algorithm selector */
-  boolean do_fancy_upsampling;	/* TRUE=apply fancy upsampling */
-  boolean do_block_smoothing;	/* TRUE=apply interblock smoothing */
+  J_DCT_METHOD dct_method;      /* IDCT algorithm selector */
+  boolean do_fancy_upsampling;  /* TRUE=apply fancy upsampling */
+  boolean do_block_smoothing;   /* TRUE=apply interblock smoothing */
 
-  boolean quantize_colors;	/* TRUE=colormapped output wanted */
+  boolean quantize_colors;      /* TRUE=colormapped output wanted */
   /* the following are ignored if not quantize_colors: */
-  J_DITHER_MODE dither_mode;	/* type of color dithering to use */
-  boolean two_pass_quantize;	/* TRUE=use two-pass color quantization */
-  int desired_number_of_colors;	/* max # colors to use in created colormap */
+  J_DITHER_MODE dither_mode;    /* type of color dithering to use */
+  boolean two_pass_quantize;    /* TRUE=use two-pass color quantization */
+  int desired_number_of_colors; /* max # colors to use in created colormap */
   /* these are significant only in buffered-image mode: */
-  boolean enable_1pass_quant;	/* enable future use of 1-pass quantizer */
+  boolean enable_1pass_quant;   /* enable future use of 1-pass quantizer */
   boolean enable_external_quant;/* enable future use of external colormap */
-  boolean enable_2pass_quant;	/* enable future use of 2-pass quantizer */
+  boolean enable_2pass_quant;   /* enable future use of 2-pass quantizer */
 
   /* Description of actual output image that will be returned to application.
    * These fields are computed by jpeg_start_decompress().
@@ -556,14 +556,14 @@ struct jpeg_decompress_struct {
    * in advance of calling jpeg_start_decompress().
    */
 
-  JDIMENSION output_width;	/* scaled image width */
-  JDIMENSION output_height;	/* scaled image height */
-  int out_color_components;	/* # of color components in out_color_space */
-  int output_components;	/* # of color components returned */
+  JDIMENSION output_width;      /* scaled image width */
+  JDIMENSION output_height;     /* scaled image height */
+  int out_color_components;     /* # of color components in out_color_space */
+  int output_components;        /* # of color components returned */
   /* output_components is 1 (a colormap index) when quantizing colors;
    * otherwise it equals out_color_components.
    */
-  int rec_outbuf_height;	/* min recommended height of scanline buffer */
+  int rec_outbuf_height;        /* min recommended height of scanline buffer */
   /* If the buffer passed to jpeg_read_scanlines() is less than this many rows
    * high, space and time will be wasted due to unnecessary data copying.
    * Usually rec_outbuf_height will be 1 or 2, at most 4.
@@ -575,8 +575,8 @@ struct jpeg_decompress_struct {
    * jpeg_start_decompress or jpeg_start_output.
    * The map has out_color_components rows and actual_number_of_colors columns.
    */
-  int actual_number_of_colors;	/* number of entries in use */
-  JSAMPARRAY colormap;		/* The color map as a 2-D pixel array */
+  int actual_number_of_colors;  /* number of entries in use */
+  JSAMPARRAY colormap;          /* The color map as a 2-D pixel array */
 
   /* State variables: these variables indicate the progress of decompression.
    * The application may examine these but must not modify them.
@@ -586,20 +586,20 @@ struct jpeg_decompress_struct {
    * Application may use this to control its processing loop, e.g.,
    * "while (output_scanline < output_height)".
    */
-  JDIMENSION output_scanline;	/* 0 .. output_height-1  */
+  JDIMENSION output_scanline;   /* 0 .. output_height-1  */
 
   /* Current input scan number and number of iMCU rows completed in scan.
    * These indicate the progress of the decompressor input side.
    */
-  int input_scan_number;	/* Number of SOS markers seen so far */
-  JDIMENSION input_iMCU_row;	/* Number of iMCU rows completed */
+  int input_scan_number;        /* Number of SOS markers seen so far */
+  JDIMENSION input_iMCU_row;    /* Number of iMCU rows completed */
 
   /* The "output scan number" is the notional scan being displayed by the
    * output side.  The decompressor will not allow output scan/row number
    * to get ahead of input scan/row, but it can fall arbitrarily far behind.
    */
-  int output_scan_number;	/* Nominal scan number being displayed */
-  JDIMENSION output_iMCU_row;	/* Number of iMCU rows read */
+  int output_scan_number;       /* Nominal scan number being displayed */
+  JDIMENSION output_iMCU_row;   /* Number of iMCU rows read */
 
   /* Current progression status.  coef_bits[c][i] indicates the precision
    * with which component c's DCT coefficient i (in zigzag order) is known.
@@ -608,7 +608,7 @@ struct jpeg_decompress_struct {
    * (thus, 0 at completion of the progression).
    * This pointer is NULL when reading a non-progressive file.
    */
-  int (*coef_bits)[DCTSIZE2];	/* -1 or current Al value for each coef */
+  int (*coef_bits)[DCTSIZE2];   /* -1 or current Al value for each coef */
 
   /* Internal JPEG parameters --- the application usually need not look at
    * these fields.  Note that the decompressor output side may not use
@@ -630,16 +630,16 @@ struct jpeg_decompress_struct {
    * are given in SOF/SOS markers or defined to be reset by SOI.
    */
 
-  int data_precision;		/* bits of precision in image data */
+  int data_precision;           /* bits of precision in image data */
 
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
 #if JPEG_LIB_VERSION >= 80
-  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
+  boolean is_baseline;          /* TRUE if Baseline SOF0 encountered */
 #endif
-  boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
-  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
+  boolean progressive_mode;     /* TRUE if SOFn specifies progressive mode */
+  boolean arith_code;           /* TRUE=arithmetic coding, FALSE=Huffman */
 
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
@@ -650,17 +650,17 @@ struct jpeg_decompress_struct {
   /* These fields record data obtained from optional markers recognized by
    * the JPEG library.
    */
-  boolean saw_JFIF_marker;	/* TRUE iff a JFIF APP0 marker was found */
+  boolean saw_JFIF_marker;      /* TRUE iff a JFIF APP0 marker was found */
   /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */
-  UINT8 JFIF_major_version;	/* JFIF version number */
+  UINT8 JFIF_major_version;     /* JFIF version number */
   UINT8 JFIF_minor_version;
-  UINT8 density_unit;		/* JFIF code for pixel size units */
-  UINT16 X_density;		/* Horizontal pixel density */
-  UINT16 Y_density;		/* Vertical pixel density */
-  boolean saw_Adobe_marker;	/* TRUE iff an Adobe APP14 marker was found */
-  UINT8 Adobe_transform;	/* Color transform code from Adobe marker */
+  UINT8 density_unit;           /* JFIF code for pixel size units */
+  UINT16 X_density;             /* Horizontal pixel density */
+  UINT16 Y_density;             /* Vertical pixel density */
+  boolean saw_Adobe_marker;     /* TRUE iff an Adobe APP14 marker was found */
+  UINT8 Adobe_transform;        /* Color transform code from Adobe marker */
 
-  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean CCIR601_sampling;     /* TRUE=first samples are cosited */
 
   /* Aside from the specific data retained from APPn markers known to the
    * library, the uninterpreted contents of any or all APPn and COM markers
@@ -675,17 +675,17 @@ struct jpeg_decompress_struct {
   /*
    * These fields are computed during decompression startup
    */
-  int max_h_samp_factor;	/* largest h_samp_factor */
-  int max_v_samp_factor;	/* largest v_samp_factor */
+  int max_h_samp_factor;        /* largest h_samp_factor */
+  int max_v_samp_factor;        /* largest v_samp_factor */
 
 #if JPEG_LIB_VERSION >= 70
-  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
-  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+  int min_DCT_h_scaled_size;    /* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
 #else
-  int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+  int min_DCT_scaled_size;      /* smallest DCT_scaled_size of any component */
 #endif
 
-  JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows in image */
   /* The coefficient controller's input and output progress is measured in
    * units of "iMCU" (interleaved MCU) rows.  These are the same as MCU rows
    * in fully interleaved JPEG scans, but are used whether the scan is
@@ -701,26 +701,26 @@ struct jpeg_decompress_struct {
    * They describe the components and MCUs actually appearing in the scan.
    * Note that the decompressor output side must not use these fields.
    */
-  int comps_in_scan;		/* # of JPEG components in this scan */
+  int comps_in_scan;            /* # of JPEG components in this scan */
   jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
 
-  JDIMENSION MCUs_per_row;	/* # of MCUs across the image */
-  JDIMENSION MCU_rows_in_scan;	/* # of MCU rows in the image */
+  JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
+  JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
 
-  int blocks_in_MCU;		/* # of DCT blocks per MCU */
+  int blocks_in_MCU;            /* # of DCT blocks per MCU */
   int MCU_membership[D_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
   /* i'th block in an MCU */
 
-  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
+  int Ss, Se, Ah, Al;           /* progressive JPEG parameters for scan */
 
 #if JPEG_LIB_VERSION >= 80
   /* These fields are derived from Se of first SOS marker.
    */
-  int block_size;		/* the basic DCT block size: 1..16 */
+  int block_size;               /* the basic DCT block size: 1..16 */
   const int * natural_order; /* natural-order position array for entropy decode */
-  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+  int lim_Se;                   /* min( Se, DCTSIZE2-1 ) for entropy decode */
 #endif
 
   /* This field is shared between entropy decoder and marker parser.
@@ -765,10 +765,10 @@ struct jpeg_error_mgr {
   void (*output_message) (j_common_ptr cinfo);
   /* Format a message string for the most recent JPEG error or message */
   void (*format_message) (j_common_ptr cinfo, char * buffer);
-#define JMSG_LENGTH_MAX  200	/* recommended size of format_message buffer */
+#define JMSG_LENGTH_MAX  200    /* recommended size of format_message buffer */
   /* Reset error state variables at start of a new image */
   void (*reset_error_mgr) (j_common_ptr cinfo);
-  
+
   /* The message ID code and any parameters are saved here.
    * A message can have one string parameter or up to 8 int parameters.
    */
@@ -778,18 +778,18 @@ struct jpeg_error_mgr {
     int i[8];
     char s[JMSG_STR_PARM_MAX];
   } msg_parm;
-  
+
   /* Standard state variables for error facility */
-  
-  int trace_level;		/* max msg_level that will be displayed */
-  
+
+  int trace_level;              /* max msg_level that will be displayed */
+
   /* For recoverable corrupt-data errors, we emit a warning message,
    * but keep going unless emit_message chooses to abort.  emit_message
    * should count warnings in num_warnings.  The surrounding application
    * can check for bad data by seeing if num_warnings is nonzero at the
    * end of processing.
    */
-  long num_warnings;		/* number of corrupt-data warnings */
+  long num_warnings;            /* number of corrupt-data warnings */
 
   /* These fields point to the table(s) of error message strings.
    * An application can change the table pointer to switch to a different
@@ -807,8 +807,8 @@ struct jpeg_error_mgr {
    * It contains strings numbered first_addon_message..last_addon_message.
    */
   const char * const * addon_message_table; /* Non-library errors */
-  int first_addon_message;	/* code for first string in addon table */
-  int last_addon_message;	/* code for last string in addon table */
+  int first_addon_message;      /* code for first string in addon table */
+  int last_addon_message;       /* code for last string in addon table */
 };
 
 
@@ -817,18 +817,18 @@ struct jpeg_error_mgr {
 struct jpeg_progress_mgr {
   void (*progress_monitor) (j_common_ptr cinfo);
 
-  long pass_counter;		/* work units completed in this pass */
-  long pass_limit;		/* total number of work units in this pass */
-  int completed_passes;		/* passes completed so far */
-  int total_passes;		/* total number of passes expected */
+  long pass_counter;            /* work units completed in this pass */
+  long pass_limit;              /* total number of work units in this pass */
+  int completed_passes;         /* passes completed so far */
+  int total_passes;             /* total number of passes expected */
 };
 
 
 /* Data destination object for compression */
 
 struct jpeg_destination_mgr {
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
+  JOCTET * next_output_byte;    /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
 
   void (*init_destination) (j_compress_ptr cinfo);
   boolean (*empty_output_buffer) (j_compress_ptr cinfo);
@@ -840,7 +840,7 @@ struct jpeg_destination_mgr {
 
 struct jpeg_source_mgr {
   const JOCTET * next_input_byte; /* => next byte to read from buffer */
-  size_t bytes_in_buffer;	/* # of bytes remaining in buffer */
+  size_t bytes_in_buffer;       /* # of bytes remaining in buffer */
 
   void (*init_source) (j_decompress_ptr cinfo);
   boolean (*fill_input_buffer) (j_decompress_ptr cinfo);
@@ -861,9 +861,9 @@ struct jpeg_source_mgr {
  * successful.
  */
 
-#define JPOOL_PERMANENT	0	/* lasts until master record is destroyed */
-#define JPOOL_IMAGE	1	/* lasts until done with image/datastream */
-#define JPOOL_NUMPOOLS	2
+#define JPOOL_PERMANENT 0       /* lasts until master record is destroyed */
+#define JPOOL_IMAGE     1       /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS  2
 
 typedef struct jvirt_sarray_control * jvirt_sarray_ptr;
 typedef struct jvirt_barray_control * jvirt_barray_ptr;
@@ -879,14 +879,14 @@ struct jpeg_memory_mgr {
   JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id,
                                JDIMENSION blocksperrow, JDIMENSION numrows);
   jvirt_sarray_ptr (*request_virt_sarray) (j_common_ptr cinfo, int pool_id,
-						  boolean pre_zero,
-						  JDIMENSION samplesperrow,
-						  JDIMENSION numrows,
+                                           boolean pre_zero,
+                                           JDIMENSION samplesperrow,
+                                           JDIMENSION numrows,
                                            JDIMENSION maxaccess);
   jvirt_barray_ptr (*request_virt_barray) (j_common_ptr cinfo, int pool_id,
-						  boolean pre_zero,
-						  JDIMENSION blocksperrow,
-						  JDIMENSION numrows,
+                                           boolean pre_zero,
+                                           JDIMENSION blocksperrow,
+                                           JDIMENSION numrows,
                                            JDIMENSION maxaccess);
   void (*realize_virt_arrays) (j_common_ptr cinfo);
   JSAMPARRAY (*access_virt_sarray) (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
@@ -928,10 +928,10 @@ EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr * err);
  */
 #define jpeg_create_compress(cinfo) \
     jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
-			(size_t) sizeof(struct jpeg_compress_struct))
+                        (size_t) sizeof(struct jpeg_compress_struct))
 #define jpeg_create_decompress(cinfo) \
     jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
-			  (size_t) sizeof(struct jpeg_decompress_struct))
+                          (size_t) sizeof(struct jpeg_decompress_struct))
 EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version,
                                   size_t structsize);
 EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version,
@@ -968,7 +968,7 @@ EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo,
                                    boolean force_baseline);
 #endif
 EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-				       const unsigned int *basic_table,
+                                   const unsigned int *basic_table,
                                    int scale_factor, boolean force_baseline);
 EXTERN(int) jpeg_quality_scaling (int quality);
 EXTERN(float) jpeg_float_quality_scaling (float quality);
@@ -981,7 +981,7 @@ EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo);
 EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo,
                                   boolean write_all_tables);
 EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo,
-					     JSAMPARRAY scanlines,
+                                         JSAMPARRAY scanlines,
                                          JDIMENSION num_lines);
 EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo);
 
@@ -1008,9 +1008,9 @@ EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo);
 /* Decompression startup: read start of JPEG datastream to see what's there */
 EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
 /* Return value is one of: */
-#define JPEG_SUSPENDED		0 /* Suspended due to lack of input data */
-#define JPEG_HEADER_OK		1 /* Found valid image datastream */
-#define JPEG_HEADER_TABLES_ONLY	2 /* Found valid table-specs-only datastream */
+#define JPEG_SUSPENDED          0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK          1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
 /* If you pass require_image = TRUE (normal case), you need not check for
  * a TABLES_ONLY return code; an abbreviated file will cause an error exit.
  * JPEG_SUSPENDED is only possible if you use a data source module that can
@@ -1020,7 +1020,7 @@ EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
 /* Main entry points for decompression */
 EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo);
 EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
-					    JSAMPARRAY scanlines,
+                                        JSAMPARRAY scanlines,
                                         JDIMENSION max_lines);
 EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
 
@@ -1036,11 +1036,11 @@ EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
 EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
 EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
 /* Return value is one of: */
-/* #define JPEG_SUSPENDED	0    Suspended due to lack of input data */
-#define JPEG_REACHED_SOS	1 /* Reached start of new scan */
-#define JPEG_REACHED_EOI	2 /* Reached end of image */
-#define JPEG_ROW_COMPLETED	3 /* Completed one iMCU row */
-#define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */
+/* #define JPEG_SUSPENDED       0    Suspended due to lack of input data */
+#define JPEG_REACHED_SOS        1 /* Reached start of new scan */
+#define JPEG_REACHED_EOI        2 /* Reached end of image */
+#define JPEG_ROW_COMPLETED      3 /* Completed one iMCU row */
+#define JPEG_SCAN_COMPLETED     4 /* Completed last iMCU row of a scan */
 
 /* Precalculate output dimensions for current decompression parameters. */
 #if JPEG_LIB_VERSION >= 80
@@ -1108,10 +1108,10 @@ EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
  * are likely to want to use them.
  */
 
-#define JPEG_RST0	0xD0	/* RST0 marker code */
-#define JPEG_EOI	0xD9	/* EOI marker code */
-#define JPEG_APP0	0xE0	/* APP0 marker code */
-#define JPEG_COM	0xFE	/* COM marker code */
+#define JPEG_RST0       0xD0    /* RST0 marker code */
+#define JPEG_EOI        0xD9    /* EOI marker code */
+#define JPEG_APP0       0xE0    /* APP0 marker code */
+#define JPEG_COM        0xFE    /* COM marker code */
 
 
 /* If we have a brain-damaged compiler that emits warnings (or worse, errors)
@@ -1120,7 +1120,7 @@ EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
  */
 
 #ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef JPEG_INTERNALS		/* will be defined in jpegint.h */
+#ifndef JPEG_INTERNALS          /* will be defined in jpegint.h */
 struct jvirt_sarray_control { long dummy; };
 struct jvirt_barray_control { long dummy; };
 struct jpeg_comp_master { long dummy; };
@@ -1155,8 +1155,8 @@ struct jpeg_color_quantizer { long dummy; };
  */
 
 #ifdef JPEG_INTERNALS
-#include "jpegint.h"		/* fetch private declarations */
-#include "jerror.h"		/* fetch error codes too */
+#include "jpegint.h"            /* fetch private declarations */
+#include "jerror.h"             /* fetch error codes too */
 #endif
 
 #ifdef __cplusplus
diff --git a/rdjpeg.c b/rdjpeg.c
index 96bb7d51..c0b53ab7 100644
--- a/rdjpeg.c
+++ b/rdjpeg.c
@@ -9,7 +9,7 @@
  *
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #if JPEG_RAW_READER
 #define NUM_ROWS 32
@@ -22,7 +22,7 @@ typedef struct _jpeg_source_struct * jpeg_source_ptr;
 typedef struct _jpeg_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
 
-  j_compress_ptr cinfo;		/* back link saves passing separate parm */
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
   
   struct jpeg_decompress_struct dinfo;
   struct jpeg_error_mgr jerr;
@@ -150,8 +150,8 @@ jinit_read_jpeg (j_compress_ptr cinfo)
   /* Create module interface object */
   source = (jpeg_source_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  sizeof(jpeg_source_struct));
-  source->cinfo = cinfo;	/* make back link for subroutines */
+                                  sizeof(jpeg_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_jpeg;
   source->pub.finish_input = finish_input_jpeg;
diff --git a/rdswitch.c b/rdswitch.c
index 66f8e59c..2ded4478 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -9,15 +9,15 @@
  *
  * This file contains routines to process some of cjpeg's more complicated
  * command-line switches.  Switches processed here are:
- *	-qtables file		Read quantization tables from text file
- *	-scans file		Read scan script from text file
- *	-quality N[,N,...]	Set quality ratings
- *	-qslots N[,N,...]	Set component quantization table selectors
- *	-sample HxV[,HxV,...]	Set component sampling factors
+ *      -qtables file           Read quantization tables from text file
+ *      -scans file             Read scan script from text file
+ *      -quality N[,N,...]      Set quality ratings
+ *      -qslots N[,N,...]       Set component quantization table selectors
+ *      -sample HxV[,HxV,...]   Set component sampling factors
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include <ctype.h>		/* to declare isdigit(), isspace() */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include <ctype.h>              /* to declare isdigit(), isspace() */
 
 
 LOCAL(int)
@@ -26,7 +26,7 @@ text_getc (FILE * file)
 /* A comment/newline sequence is returned as a newline */
 {
   register int ch;
-  
+
   ch = getc(file);
   if (ch == '#') {
     do {
@@ -44,7 +44,7 @@ read_text_integer (FILE * file, long * result, int * termchar)
 {
   register int ch;
   register long val;
-  
+
   /* Skip any leading whitespace, detect EOF */
   do {
     ch = text_getc(file);
@@ -53,7 +53,7 @@ read_text_integer (FILE * file, long * result, int * termchar)
       return FALSE;
     }
   } while (isspace(ch));
-  
+
   if (! isdigit(ch)) {
     *termchar = ch;
     return FALSE;
@@ -108,15 +108,15 @@ read_quant_tables (j_compress_ptr cinfo, char * filename, boolean force_baseline
     table[0] = (unsigned int) val;
     for (i = 1; i < DCTSIZE2; i++) {
       if (! read_text_integer(fp, &val, &termchar)) {
-	fprintf(stderr, "Invalid table data in file %s\n", filename);
-	fclose(fp);
-	return FALSE;
+        fprintf(stderr, "Invalid table data in file %s\n", filename);
+        fclose(fp);
+        return FALSE;
       }
       table[i] = (unsigned int) val;
     }
 #if JPEG_LIB_VERSION >= 70
     jpeg_add_quant_table(cinfo, tblno, table, cinfo->q_scale_factor[tblno],
-			 force_baseline);
+                         force_baseline);
 #else
     jpeg_add_quant_table(cinfo, tblno, table, q_scale_factor[tblno],
                          force_baseline);
@@ -150,7 +150,7 @@ read_scan_integer (FILE * file, long * result, int * termchar)
   ch = *termchar;
   while (ch != EOF && isspace(ch))
     ch = text_getc(file);
-  if (isdigit(ch)) {		/* oops, put it back */
+  if (isdigit(ch)) {            /* oops, put it back */
     if (ungetc(ch, file) == EOF)
       return FALSE;
     ch = ' ';
@@ -188,7 +188,7 @@ read_scan_script (j_compress_ptr cinfo, char * filename)
   int scanno, ncomps, termchar;
   long val;
   jpeg_scan_info * scanptr;
-#define MAX_SCANS  100		/* quite arbitrary limit */
+#define MAX_SCANS  100          /* quite arbitrary limit */
   jpeg_scan_info scans[MAX_SCANS];
 
   if ((fp = fopen(filename, "r")) == NULL) {
@@ -208,29 +208,29 @@ read_scan_script (j_compress_ptr cinfo, char * filename)
     ncomps = 1;
     while (termchar == ' ') {
       if (ncomps >= MAX_COMPS_IN_SCAN) {
-	fprintf(stderr, "Too many components in one scan in file %s\n",
-		filename);
-	fclose(fp);
-	return FALSE;
+        fprintf(stderr, "Too many components in one scan in file %s\n",
+                filename);
+        fclose(fp);
+        return FALSE;
       }
       if (! read_scan_integer(fp, &val, &termchar))
-	goto bogus;
+        goto bogus;
       scanptr->component_index[ncomps] = (int) val;
       ncomps++;
     }
     scanptr->comps_in_scan = ncomps;
     if (termchar == ':') {
       if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
-	goto bogus;
+        goto bogus;
       scanptr->Ss = (int) val;
       if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
-	goto bogus;
+        goto bogus;
       scanptr->Se = (int) val;
       if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
-	goto bogus;
+        goto bogus;
       scanptr->Ah = (int) val;
       if (! read_scan_integer(fp, &val, &termchar))
-	goto bogus;
+        goto bogus;
       scanptr->Al = (int) val;
     } else {
       /* set non-progressive parameters */
@@ -337,7 +337,7 @@ set_quality_ratings (j_compress_ptr cinfo, char *arg, boolean force_baseline)
  * If there are more q-table slots than parameters, the last value is replicated.
  */
 {
-  float val = 75.f;             /* default value */
+  float val = 75.f;                 /* default value */
   int tblno;
   char ch;
 
@@ -355,7 +355,7 @@ set_quality_ratings (j_compress_ptr cinfo, char *arg, boolean force_baseline)
       q_scale_factor[tblno] = jpeg_float_quality_scaling(val);
 #endif
       while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-	;
+        ;
     } else {
       /* reached end of parameter, set remaining factors to last value */
 #if JPEG_LIB_VERSION >= 70
@@ -377,25 +377,25 @@ set_quant_slots (j_compress_ptr cinfo, char *arg)
  * If there are more components than parameters, the last value is replicated.
  */
 {
-  int val = 0;			/* default table # */
+  int val = 0;                  /* default table # */
   int ci;
   char ch;
 
   for (ci = 0; ci < MAX_COMPONENTS; ci++) {
     if (*arg) {
-      ch = ',';			/* if not set by sscanf, will be ',' */
+      ch = ',';                 /* if not set by sscanf, will be ',' */
       if (sscanf(arg, "%d%c", &val, &ch) < 1)
-	return FALSE;
-      if (ch != ',')		/* syntax check */
-	return FALSE;
+        return FALSE;
+      if (ch != ',')            /* syntax check */
+        return FALSE;
       if (val < 0 || val >= NUM_QUANT_TBLS) {
-	fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n",
-		NUM_QUANT_TBLS-1);
-	return FALSE;
+        fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n",
+                NUM_QUANT_TBLS-1);
+        return FALSE;
       }
       cinfo->comp_info[ci].quant_tbl_no = val;
       while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-	;
+        ;
     } else {
       /* reached end of parameter, set remaining components to last table */
       cinfo->comp_info[ci].quant_tbl_no = val;
@@ -417,19 +417,19 @@ set_sample_factors (j_compress_ptr cinfo, char *arg)
 
   for (ci = 0; ci < MAX_COMPONENTS; ci++) {
     if (*arg) {
-      ch2 = ',';		/* if not set by sscanf, will be ',' */
+      ch2 = ',';                /* if not set by sscanf, will be ',' */
       if (sscanf(arg, "%d%c%d%c", &val1, &ch1, &val2, &ch2) < 3)
-	return FALSE;
+        return FALSE;
       if ((ch1 != 'x' && ch1 != 'X') || ch2 != ',') /* syntax check */
-	return FALSE;
+        return FALSE;
       if (val1 <= 0 || val1 > 4 || val2 <= 0 || val2 > 4) {
-	fprintf(stderr, "JPEG sampling factors must be 1..4\n");
-	return FALSE;
+        fprintf(stderr, "JPEG sampling factors must be 1..4\n");
+        return FALSE;
       }
       cinfo->comp_info[ci].h_samp_factor = val1;
       cinfo->comp_info[ci].v_samp_factor = val2;
       while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-	;
+        ;
     } else {
       /* reached end of parameter, set remaining components to 1x1 sampling */
       cinfo->comp_info[ci].h_samp_factor = 1;
diff --git a/turbojpeg.c b/turbojpeg.c
index d36ce5e8..3ccb23ee 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -243,7 +243,7 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo,
 	/* Set scan pattern again as colorspace might have changed */
 	if (cinfo->master->use_moz_defaults)
 		jpeg_simple_progression(cinfo);
-  
+
 	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[1].h_samp_factor=1;
 	cinfo->comp_info[2].h_samp_factor=1;

From 63eb69de62f9cfeb4fdfbc759eac878dd7cabba4 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Thu, 6 Nov 2014 01:30:06 -0600
Subject: [PATCH 010/108] Remove unused code in TJBench. The merge from
 libjpeg-turbo 1.4 somehow failed here, and this function from 1.3 was left
 hanging.

---
 tjbench.c | 76 -------------------------------------------------------
 1 file changed, 76 deletions(-)

diff --git a/tjbench.c b/tjbench.c
index 2a3f9445..741dc7e7 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -273,82 +273,6 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf,
 }
 
 
-int dotestyuv(unsigned char *srcbuf, int w, int h, int subsamp,
-	char *filename)
-{
-	char tempstr[1024], tempstr2[80];
-	FILE *file=NULL;  tjhandle handle=NULL;
-	unsigned char *dstbuf=NULL;
-	double start, elapsed;
-	int i, retval=0, ps=tjPixelSize[pf];
-	int yuvsize=0;
-
-	yuvsize=tjBufSizeYUV(w, h, subsamp);
-	if((dstbuf=(unsigned char *)malloc(yuvsize)) == NULL)
-		_throwunix("allocating image buffer");
-
-	if(!quiet)
-		printf(">>>>>  %s (%s) <--> YUV %s  <<<<<\n", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down", subNameLong[subsamp]);
-
-	if(quiet==1)
-		printf("%s\t%s\t%s\tN/A\t", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp]);
-
-	if((handle=tjInitCompress())==NULL)
-		_throwtj("executing tjInitCompress()");
-
-	/* Execute once to preload cache */
-	if(tjEncodeYUV2(handle, srcbuf, w, 0, h, pf, dstbuf, subsamp, flags)==-1)
-		_throwtj("executing tjEncodeYUV2()");
-
-	/* Benchmark */
-	for(i=0, start=gettime(); (elapsed=gettime()-start)<benchtime; i++)
-	{
-		if(tjEncodeYUV2(handle, srcbuf, w, 0, h, pf, dstbuf, subsamp, flags)==-1)
-			_throwtj("executing tjEncodeYUV2()");
-	}
-
-	if(tjDestroy(handle)==-1) _throwtj("executing tjDestroy()");
-	handle=NULL;
-
-	if(quiet==1) printf("%-4d  %-4d\t", w, h);
-	if(quiet)
-	{
-		printf("%s%c%s%c",
-			sigfig((double)(w*h)/1000000.*(double)i/elapsed, 4, tempstr, 1024),
-			quiet==2? '\n':'\t',
-			sigfig((double)(w*h*ps)/(double)yuvsize, 4, tempstr2, 80),
-			quiet==2? '\n':'\t');
-	}
-	else
-	{
-		printf("\n%s size: %d x %d\n", "Image", w, h);
-		printf("C--> Frame rate:           %f fps\n", (double)i/elapsed);
-		printf("     Output image size:    %d bytes\n", yuvsize);
-		printf("     Compression ratio:    %f:1\n",
-			(double)(w*h*ps)/(double)yuvsize);
-		printf("     Source throughput:    %f Megapixels/sec\n",
-			(double)(w*h)/1000000.*(double)i/elapsed);
-		printf("     Output bit stream:    %f Megabits/sec\n",
-			(double)yuvsize*8./1000000.*(double)i/elapsed);
-	}
-	snprintf(tempstr, 1024, "%s_%s.yuv", filename, subName[subsamp]);
-	if((file=fopen(tempstr, "wb"))==NULL)
-		_throwunix("opening reference image");
-	if(fwrite(dstbuf, yuvsize, 1, file)!=1)
-		_throwunix("writing reference image");
-	fclose(file);  file=NULL;
-	if(!quiet) printf("Reference image written to %s\n", tempstr);
-
-	bailout:
-	if(file) {fclose(file);  file=NULL;}
-	if(dstbuf) {free(dstbuf);  dstbuf=NULL;}
-	if(handle) {tjDestroy(handle);  handle=NULL;}
-	return retval;
-}
-
-
 int fullTest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
 	char *filename)
 {

From 017339f71579f04e2d722d401e52cebf1379ee37 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Thu, 6 Nov 2014 01:40:12 -0600
Subject: [PATCH 011/108] Change name of Mac package to com.mozilla.mozjpeg to
 avoid conflicts with libjpeg-turbo Mac package

---
 release/Distribution.xml | 10 +++++-----
 release/makemacpkg.in    |  2 +-
 release/uninstall.in     |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/release/Distribution.xml b/release/Distribution.xml
index ee73ab00..529e951f 100644
--- a/release/Distribution.xml
+++ b/release/Distribution.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <installer-gui-script minSpecVersion="1">
-	<title>libjpeg-turbo</title>
+	<title>mozjpeg</title>
 	<welcome file="Welcome.rtf" />
 	<readme file="ReadMe.txt" />
 	<license file="License.rtf" />
@@ -12,13 +12,13 @@
 	<options customize="never" />
 	<choices-outline>
 		<line choice="default">
-			<line choice="com.libjpeg-turbo.libjpeg-turbo"/>
+			<line choice="com.mozilla.mozjpeg"/>
 		</line>
 	</choices-outline>
 	<choice id="default"/>
-	<choice id="com.libjpeg-turbo.libjpeg-turbo" visible="false">
-		<pkg-ref id="com.libjpeg-turbo.libjpeg-turbo"/>
+	<choice id="com.mozilla.mozjpeg" visible="false">
+		<pkg-ref id="com.mozilla.mozjpeg"/>
 	</choice>
 	<pkg-ref auth="root"
-		id="com.libjpeg-turbo.libjpeg-turbo">libjpeg-turbo.pkg</pkg-ref>
+		id="com.mozilla.mozjpeg">mozjpeg.pkg</pkg-ref>
 </installer-gui-script>
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 8b8a1865..ea45522d 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -254,7 +254,7 @@ cp $SRCDIR/release/License.rtf $SRCDIR/release/Welcome.rtf $SRCDIR/release/ReadM
 
 mkdir $TMPDIR/dmg
 pkgbuild --root $PKGROOT --version $VERSION.$BUILD \
-	--identifier com.libjpeg-turbo.libjpeg-turbo $TMPDIR/pkg/$PACKAGE_NAME.pkg
+	--identifier com.mozilla.$PACKAGE_NAME $TMPDIR/pkg/$PACKAGE_NAME.pkg
 productbuild --distribution $SRCDIR/release/Distribution.xml \
 	--package-path $TMPDIR/pkg/ --resources $TMPDIR/pkg/ \
 	$TMPDIR/dmg/$PACKAGE_NAME.pkg
diff --git a/release/uninstall.in b/release/uninstall.in
index f00259b1..6e47801f 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -32,7 +32,7 @@ if [ ! "`id -u`" = "0" ]; then
 fi
 
 PACKAGE=@PKGNAME@
-MACPACKAGE=com.$PACKAGE.$PACKAGE
+MACPACKAGE=com.mozilla.$PACKAGE
 RECEIPT=/Library/Receipts/$PACKAGE.pkg
 
 LSBOM=

From f7067a9e73feeba955ff4da0b09bb51b2a4f566a Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 18 Nov 2014 15:53:34 +0000
Subject: [PATCH 012/108] Uses clz and bsr instructions for bit counting on
 ARM64 platforms as well.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1413 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt | 3 +++
 jchuff.c      | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 6b71ac2a..6663d0af 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -13,6 +13,9 @@ instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
 [3] Fixed an issue in tjBufSizeYUV2() wherby it would erroneously return 0
 instead of -1 if width was < 1.
 
+[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
+ARM64 platforms (see 1.4 beta1 [5].)
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/jchuff.c b/jchuff.c
index d8eb1d35..447209a0 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -37,7 +37,7 @@
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && defined __arm__
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
 #if !defined __thumb__ || defined __thumb2__
 #define USE_CLZ_INTRINSIC
 #endif

From 60ba1963feb0d561841d652343b865914cf9c556 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 18 Nov 2014 21:45:02 +0000
Subject: [PATCH 013/108] Make TJCompressor.close() and TJDecompressor.close()
 idempotent

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1415 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt                                       | 7 +++++++
 java/org/libjpegturbo/turbojpeg/TJCompressor.java   | 3 ++-
 java/org/libjpegturbo/turbojpeg/TJDecompressor.java | 3 ++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 6663d0af..7d3143e9 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -16,6 +16,13 @@ instead of -1 if width was < 1.
 [5] The Huffman encoder now uses clz and bsr instructions for bit counting on
 ARM64 platforms (see 1.4 beta1 [5].)
 
+[6] The close() method in the TJCompressor and TJDecompressor Java classes is
+now idempotent.  Previously, that method would call the native tjDestroy()
+function even if the TurboJPEG instance had already been destroyed.  This
+caused an exception to be thrown during finalization, if the close() method had
+already been called.  The exception was caught, but it was still an expensive
+operation.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/java/org/libjpegturbo/turbojpeg/TJCompressor.java b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
index c4a8cc58..a780d15a 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
@@ -567,7 +567,8 @@ public class TJCompressor {
    * Free the native structures associated with this compressor instance.
    */
   public void close() throws Exception {
-    destroy();
+    if (handle != 0)
+      destroy();
   }
 
   protected void finalize() throws Throwable {
diff --git a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
index 1a2774c0..876ff8f0 100644
--- a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
@@ -834,7 +834,8 @@ public class TJDecompressor {
    * Free the native structures associated with this decompressor instance.
    */
   public void close() throws Exception {
-    destroy();
+    if (handle != 0)
+      destroy();
   }
 
   protected void finalize() throws Throwable {

From 24ad6a0179203f8b42a26fbf9e9de4a0f2a44b00 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 19 Nov 2014 00:54:39 +0000
Subject: [PATCH 014/108] Sometimes the sampling factors in grayscale images
 can be > 1 (for instance, if compressing using 'cjpeg -sample 2x2
 -grayscale'.)  Technically, sampling factors have no meaning with grayscale
 JPEGs, and the libjpeg decompressor ignores them in that case.  Thus, the
 TurboJPEG decompressor should ignore them as well.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1417 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt | 9 +++++++++
 turbojpeg.c   | 8 ++++++++
 2 files changed, 17 insertions(+)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 7d3143e9..e21ef968 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -23,6 +23,15 @@ caused an exception to be thrown during finalization, if the close() method had
 already been called.  The exception was caught, but it was still an expensive
 operation.
 
+[7] The TurboJPEG API previously generated an error ("Could not determine
+subsampling type for JPEG image") when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with 'cjpeg -grayscale -sample 2x2').  Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor.  However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/turbojpeg.c b/turbojpeg.c
index 06cca665..bc153da9 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -316,6 +316,14 @@ static int setDecompDefaults(struct jpeg_decompress_struct *dinfo,
 static int getSubsamp(j_decompress_ptr dinfo)
 {
 	int retval=-1, i, k;
+
+	/* The sampling factors actually have no meaning with grayscale JPEG files,
+	   and in fact it's possible to generate grayscale JPEGs with sampling
+	   factors > 1 (even though those sampling factors are ignored by the
+	   decompressor.)  Thus, we need to treat grayscale as a special case. */
+	if(dinfo->num_components==1 && dinfo->jpeg_color_space==JCS_GRAYSCALE)
+		return TJSAMP_GRAY;
+
 	for(i=0; i<NUMSUBOPT; i++)
 	{
 		if(dinfo->num_components==pixelsize[i]

From 9cb270a2169f66fb5141a0115dcf5efd9c8c9160 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Wed, 19 Nov 2014 23:31:20 -0600
Subject: [PATCH 015/108] Use mozjpeg defaults by default. Since mozjpeg is now
 backward ABI-compatible with libjpeg[-turbo], it is now possible to
 temporarily load mozjpeg into a binary application and cause that application
 to generate uber-compressed JPEGs (at the expense of an extreme performance
 loss, of course.)  For instance, someone could do

LD_LIBRARY_PATH=/opt/mozjpeg/lib convert blah_blah_blah

to make ImageMagick use mozjpeg instead of the system's pre-installed JPEG
library (libjpeg-turbo, in most cases.)  However, this only makes sense if
mozjpeg is actually producing different behavior by default than libjpeg-turbo.
Currently it isn't.  Currently it requires the application to set
JBOOLEAN_USE_MOZ_DEFAULTS to TRUE in order to enable the mozjpeg-specific
behavior, but of course applications that were built to use libjpeg[-turbo]
won't do that.  Thus, this patch sets use_moz_defaults to TRUE by default,
requiring an application to explicitly set it to FALSE in order to revert to
the libjpeg[-turbo] behavior (makes sense, since the only applications that
would need to revert to the libjpeg[-turbo] behavior would be mozjpeg-aware
applications.)

Note that we discussed the possibility of adding a function
(jpeg_revert_defaults()), which would act the same as jpeg_set_defaults() does
in libjpeg[-turbo].  This is a good solution for implementing the -revert
switch in cjpeg, but unfortunately it doesn't work for jpegtran.  The reason
is that jpeg_set_defaults() is called within the body of
jpeg_copy_critical_parameters(), which is part of the API.  So yet again,
if mozjpeg were loaded into a non-mozjpeg-aware application at run time, it
would be desirable for jpeg_copy_critical_parameters() to set the parameters
to mozjpeg defaults.  That means that, in order to implement the -revert
switch in jpegtran, it would be necessary to introduce a new function
(jpeg_revert_critical_parameters(), perhaps).  It seems cleaner to just keep
using the JBOOLEAN_USE_MOZ_DEFAULTS parameter to control the behavior of
jpeg_set_defaults(), even though this represents a minor abuse of the libjpeg
API (jpeg_set_defaults() is technically supposed to set all of the parameters
to defaults, irrespective of any previous state.  However, as long as we
document that JBOOLEAN_USE_MOZ_DEFAULTS works differently, then it should be
OK.)
---
 cjpeg.c     | 2 --
 example.c   | 2 --
 jcapimin.c  | 1 +
 jcstest.c   | 4 ----
 jpegtran.c  | 2 --
 turbojpeg.c | 1 -
 yuvjpeg.c   | 3 ---
 7 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index 784e7bc0..c6ea0910 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -634,8 +634,6 @@ main (int argc, char **argv)
    */
 
   cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
-  if (jpeg_c_bool_param_supported(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
-    jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
   jpeg_set_defaults(&cinfo);
 
   /* Scan command line to find file names.
diff --git a/example.c b/example.c
index 72939acf..0a65a6cc 100644
--- a/example.c
+++ b/example.c
@@ -126,8 +126,6 @@ write_JPEG_file (char * filename, int quality)
   cinfo.image_height = image_height;
   cinfo.input_components = 3;           /* # of color components per pixel */
   cinfo.in_color_space = JCS_RGB;       /* colorspace of input image */
-  jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE); /* use Mozilla defaults for improved compression */
-  
   /* Now use the library's routine to set default compression parameters.
    * (You must set at least cinfo.in_color_space before calling this,
    * since the defaults depend on the source color space.)
diff --git a/jcapimin.c b/jcapimin.c
index b16f9e40..8935546b 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -99,6 +99,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
   cinfo->master = (struct jpeg_comp_master *)
       jpeg_get_small ((j_common_ptr) cinfo, sizeof(struct jpeg_comp_master));
   MEMZERO(cinfo->master, sizeof(struct jpeg_comp_master));
+  cinfo->master->use_moz_defaults = TRUE;
 }
 
 
diff --git a/jcstest.c b/jcstest.c
index 79829e6b..358ed251 100644
--- a/jcstest.c
+++ b/jcstest.c
@@ -1,7 +1,5 @@
 /*
  * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
- * mozjpeg Modifications:
- * Copyright (C) 2014, Mozilla Corporation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -87,8 +85,6 @@ int main(void)
 
   jpeg_create_compress(&cinfo);
   cinfo.input_components = 3;
-  if (jpeg_c_bool_param_supported(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
-    jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
   jpeg_set_defaults(&cinfo);
   cinfo.in_color_space = JCS_EXT_RGB;
   jpeg_default_colorspace(&cinfo);
diff --git a/jpegtran.c b/jpegtran.c
index 8275d1a9..926c2774 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -415,8 +415,6 @@ main (int argc, char **argv)
   /* Initialize the JPEG compression object with default error handling. */
   dstinfo.err = jpeg_std_error(&jdsterr);
   jpeg_create_compress(&dstinfo);
-  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
-    jpeg_c_set_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
 
   /* Scan command line to find file names.
    * It is convenient to use just one switch-parsing routine, but the switch
diff --git a/turbojpeg.c b/turbojpeg.c
index 16aa9088..7617ef8b 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -206,7 +206,6 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo,
 	}
 
 	cinfo->input_components=tjPixelSize[pixelFormat];
-	cinfo->master->use_moz_defaults = TRUE;
 	jpeg_set_defaults(cinfo);
 
 	if((env=getenv("TJ_OPTIMIZE"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
diff --git a/yuvjpeg.c b/yuvjpeg.c
index 0a94079c..11437941 100644
--- a/yuvjpeg.c
+++ b/yuvjpeg.c
@@ -217,9 +217,6 @@ int main(int argc, char *argv[]) {
 
   jpeg_stdio_dest(&cinfo, jpg_fd);
 
-  if (jpeg_c_bool_param_supported(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
-    jpeg_c_set_bool_param(&cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, TRUE);
-
   cinfo.image_width = luma_width;
   cinfo.image_height = luma_height;
   cinfo.input_components = 3;

From 602f5bea744221aaea87e0a87f1fb16e29e12dab Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 22 Nov 2014 04:03:21 +0000
Subject: [PATCH 016/108] Print the library version and exit whenever -version
 is passed to cjpeg, djpeg, or jpegtran.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1421 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt | 3 +++
 cjpeg.1       | 5 ++++-
 cjpeg.c       | 9 ++++++++-
 djpeg.1       | 5 ++++-
 djpeg.c       | 6 ++++++
 jpegtran.1    | 5 ++++-
 jpegtran.c    | 8 +++++++-
 7 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index e21ef968..f0d126b4 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -32,6 +32,9 @@ for such images are ignored by the decompressor.  However, the TurboJPEG API
 was being too rigid and was expecting the sampling factors to be equal to 1
 before it treated the image as a grayscale JPEG.
 
+[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
+print the library version and exit.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/cjpeg.1 b/cjpeg.1
index 5f5090cf..e338c80c 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -1,4 +1,4 @@
-.TH CJPEG 1 "11 May 2014"
+.TH CJPEG 1 "21 November 2014"
 .SH NAME
 cjpeg \- compress an image file to a JPEG file
 .SH SYNOPSIS
@@ -221,6 +221,9 @@ give more output.  Also, version information is printed at startup.
 .B \-debug
 Same as
 .BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
 .PP
 The
 .B \-restart
diff --git a/cjpeg.c b/cjpeg.c
index a7d25047..af946d28 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2013, D. R. Commander.
+ * Copyright (C) 2010, 2013-2014, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains a command-line user interface for the JPEG compressor.
@@ -192,6 +192,7 @@ usage (void)
   fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
 #endif
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
   fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
   fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
@@ -289,8 +290,14 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
                 JVERSION);
         printed_version = TRUE;
       }
+
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);        
+
     } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
       /* Force a monochrome JPEG file to be generated. */
       jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
diff --git a/djpeg.1 b/djpeg.1
index 9b040ae9..73d05148 100644
--- a/djpeg.1
+++ b/djpeg.1
@@ -1,4 +1,4 @@
-.TH DJPEG 1 "11 May 2014"
+.TH DJPEG 1 "21 November 2014"
 .SH NAME
 djpeg \- decompress a JPEG file to an image file
 .SH SYNOPSIS
@@ -202,6 +202,9 @@ give more output.  Also, version information is printed at startup.
 .B \-debug
 Same as
 .BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
 .SH EXAMPLES
 .LP
 This example decompresses the JPEG file foo.jpg, quantizes it to
diff --git a/djpeg.c b/djpeg.c
index 7c63f250..0a669476 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -165,6 +165,7 @@ usage (void)
 #endif
 
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
 }
 
@@ -261,6 +262,11 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);        
+
     } else if (keymatch(arg, "fast", 1)) {
       /* Select recommended processing options for quick-and-dirty output. */
       cinfo->two_pass_quantize = FALSE;
diff --git a/jpegtran.1 b/jpegtran.1
index b6a3e566..5b8e1263 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -1,4 +1,4 @@
-.TH JPEGTRAN 1 "1 January 2013"
+.TH JPEGTRAN 1 "21 November 2014"
 .SH NAME
 jpegtran \- lossless transformation of JPEG files
 .SH SYNOPSIS
@@ -214,6 +214,9 @@ give more output.  Also, version information is printed at startup.
 .B \-debug
 Same as
 .BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
 .SH EXAMPLES
 .LP
 This example converts a baseline JPEG file to progressive form:
diff --git a/jpegtran.c b/jpegtran.c
index 6b40de39..e4024cbb 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2014, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains a command-line user interface for JPEG transcoding.
@@ -86,6 +86,7 @@ usage (void)
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
 #ifdef C_MULTISCAN_FILES_SUPPORTED
   fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
@@ -212,6 +213,11 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);        
+
     } else if (keymatch(arg, "flip", 1)) {
       /* Mirror left-right or top-bottom. */
       if (++argn >= argc)       /* advance to next argument */

From 0e9c14e1bb06b3543264e2944fa26d934704e168 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 22 Nov 2014 04:25:42 +0000
Subject: [PATCH 017/108] Fix whitespace issues introduced with previous
 commit.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1424 632fc199-4ca6-4c93-a231-07263d6284db
---
 cjpeg.c    | 3 +--
 djpeg.c    | 2 +-
 jpegtran.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index af946d28..92e2824f 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -290,13 +290,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
                 JVERSION);
         printed_version = TRUE;
       }
-
       cinfo->err->trace_level++;
 
     } else if (keymatch(arg, "version", 4)) {
       fprintf(stderr, "%s version %s (build %s)\n",
               PACKAGE_NAME, VERSION, BUILD);
-      exit(EXIT_SUCCESS);        
+      exit(EXIT_SUCCESS);
 
     } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
       /* Force a monochrome JPEG file to be generated. */
diff --git a/djpeg.c b/djpeg.c
index 0a669476..8ddff96e 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -265,7 +265,7 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
     } else if (keymatch(arg, "version", 4)) {
       fprintf(stderr, "%s version %s (build %s)\n",
               PACKAGE_NAME, VERSION, BUILD);
-      exit(EXIT_SUCCESS);        
+      exit(EXIT_SUCCESS);
 
     } else if (keymatch(arg, "fast", 1)) {
       /* Select recommended processing options for quick-and-dirty output. */
diff --git a/jpegtran.c b/jpegtran.c
index e4024cbb..f978cef8 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -216,7 +216,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
     } else if (keymatch(arg, "version", 4)) {
       fprintf(stderr, "%s version %s (build %s)\n",
               PACKAGE_NAME, VERSION, BUILD);
-      exit(EXIT_SUCCESS);        
+      exit(EXIT_SUCCESS);
 
     } else if (keymatch(arg, "flip", 1)) {
       /* Mirror left-right or top-bottom. */

From f20cba1bb078b978e042b50355931f6e041dd6a5 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Fri, 21 Nov 2014 22:26:15 -0600
Subject: [PATCH 018/108] Print the library version and exit whenever -version
 is passed to cjpeg, djpeg, or jpegtran.

---
 ChangeLog.txt | 3 +++
 cjpeg.1       | 5 ++++-
 cjpeg.c       | 8 +++++++-
 djpeg.1       | 5 ++++-
 djpeg.c       | 6 ++++++
 jpegtran.1    | 5 ++++-
 jpegtran.c    | 8 +++++++-
 7 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 6b71ac2a..9fa884a4 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -13,6 +13,9 @@ instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
 [3] Fixed an issue in tjBufSizeYUV2() wherby it would erroneously return 0
 instead of -1 if width was < 1.
 
+[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
+print the library version and exit.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/cjpeg.1 b/cjpeg.1
index 5f5090cf..e338c80c 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -1,4 +1,4 @@
-.TH CJPEG 1 "11 May 2014"
+.TH CJPEG 1 "21 November 2014"
 .SH NAME
 cjpeg \- compress an image file to a JPEG file
 .SH SYNOPSIS
@@ -221,6 +221,9 @@ give more output.  Also, version information is printed at startup.
 .B \-debug
 Same as
 .BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
 .PP
 The
 .B \-restart
diff --git a/cjpeg.c b/cjpeg.c
index c6ea0910..76707d3c 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2013, D. R. Commander.
+ * Copyright (C) 2010, 2013-2014, D. R. Commander.
  * mozjpeg Modifications:
  * Copyright (C) 2014, Mozilla Corporation.
  * For conditions of distribution and use, see the accompanying README file.
@@ -214,6 +214,7 @@ usage (void)
   fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
 #endif
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
   fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
   fprintf(stderr, "  -qslots N[,...]    Set component quantization tables\n");
@@ -324,6 +325,11 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
     } else if (keymatch(arg, "fastcrush", 4)) {
       jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
 
diff --git a/djpeg.1 b/djpeg.1
index 9b040ae9..73d05148 100644
--- a/djpeg.1
+++ b/djpeg.1
@@ -1,4 +1,4 @@
-.TH DJPEG 1 "11 May 2014"
+.TH DJPEG 1 "21 November 2014"
 .SH NAME
 djpeg \- decompress a JPEG file to an image file
 .SH SYNOPSIS
@@ -202,6 +202,9 @@ give more output.  Also, version information is printed at startup.
 .B \-debug
 Same as
 .BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
 .SH EXAMPLES
 .LP
 This example decompresses the JPEG file foo.jpg, quantizes it to
diff --git a/djpeg.c b/djpeg.c
index 7c63f250..8ddff96e 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -165,6 +165,7 @@ usage (void)
 #endif
 
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
 }
 
@@ -261,6 +262,11 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
     } else if (keymatch(arg, "fast", 1)) {
       /* Select recommended processing options for quick-and-dirty output. */
       cinfo->two_pass_quantize = FALSE;
diff --git a/jpegtran.1 b/jpegtran.1
index b6a3e566..5b8e1263 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -1,4 +1,4 @@
-.TH JPEGTRAN 1 "1 January 2013"
+.TH JPEGTRAN 1 "21 November 2014"
 .SH NAME
 jpegtran \- lossless transformation of JPEG files
 .SH SYNOPSIS
@@ -214,6 +214,9 @@ give more output.  Also, version information is printed at startup.
 .B \-debug
 Same as
 .BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
 .SH EXAMPLES
 .LP
 This example converts a baseline JPEG file to progressive form:
diff --git a/jpegtran.c b/jpegtran.c
index 926c2774..782c4732 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2014, D. R. Commander.
  * mozjpeg Modifications:
  * Copyright (C) 2014, Mozilla Corporation.
  * For conditions of distribution and use, see the accompanying README file.
@@ -92,6 +92,7 @@ usage (void)
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
 #ifdef C_MULTISCAN_FILES_SUPPORTED
   fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
@@ -222,6 +223,11 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
     } else if (keymatch(arg, "flip", 1)) {
       /* Mirror left-right or top-bottom. */
       if (++argn >= argc)       /* advance to next argument */

From f03d5df238b9d14cd61715db171799a10d93ec52 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 22 Nov 2014 22:07:05 +0000
Subject: [PATCH 019/108] Fix Huffman local buffer overrun discovered by Debian
 developers when attempting to transform a junk image using ImageMagick:
 https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=768369

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1425 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt | 12 ++++++++++++
 jchuff.c      | 11 ++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index f0d126b4..58b5208c 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -35,6 +35,18 @@ before it treated the image as a grayscale JPEG.
 [8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
 print the library version and exit.
 
+[9] Referring to 1.4 beta1 [15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun.  Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/jchuff.c b/jchuff.c
index 447209a0..a5c0a1fd 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -408,7 +408,16 @@ dump_buffer (working_state * state)
 #endif
 
 
-#define BUFSIZE (DCTSIZE2 * 2) + 8
+/* Although it is exceedingly rare, it is possible for a Huffman-encoded
+ * coefficient block to be larger than the 128-byte unencoded block.  For each
+ * of the 64 coefficients, PUT_BITS is invoked twice, and each invocation can
+ * theoretically store 16 bits (for a maximum of 2048 bits or 256 bytes per
+ * encoded block.)  If, for instance, one artificially sets the AC
+ * coefficients to alternating values of 32767 and -32768 (using the JPEG
+ * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
+ * larger than 200 bytes.
+ */
+#define BUFSIZE (DCTSIZE2 * 4)
 
 #define LOAD_BUFFER() { \
   if (state->free_in_buffer < BUFSIZE) { \

From a9cad80d193242aaaa85aeddb48114ff86e90797 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 22 Nov 2014 23:55:25 +0000
Subject: [PATCH 020/108] Oops.  Include the tjPlane*() functions in the
 mapfile so that they are exposed in the shared library on ELF systems.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1428 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt         | 5 +++++
 turbojpeg-mapfile     | 3 +++
 turbojpeg-mapfile.jni | 3 +++
 3 files changed, 11 insertions(+)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 58b5208c..ac6aad33 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -47,6 +47,11 @@ order), the Huffman encoder can produce encoded blocks that approach double the
 size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
 256 bytes, which should prevent any such issue from re-occurring in the future.
 
+[10] The new tjPlaneSizeYUV(), tjPlaneWidth(), and tjPlaneHeight() functions
+were not actually usable on any platform except OS X and Windows, because
+those functions were not included in the libturbojpeg mapfile.  This has been
+fixed.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/turbojpeg-mapfile b/turbojpeg-mapfile
index bd675519..35d55ae3 100755
--- a/turbojpeg-mapfile
+++ b/turbojpeg-mapfile
@@ -50,4 +50,7 @@ TURBOJPEG_1.4
 		tjDecompressToYUVPlanes;
 		tjEncodeYUV3;
 		tjEncodeYUVPlanes;
+		tjPlaneHeight;
+		tjPlaneSizeYUV;
+		tjPlaneWidth;
 } TURBOJPEG_1.2;
diff --git a/turbojpeg-mapfile.jni b/turbojpeg-mapfile.jni
index 3370d1f2..9c1d25bb 100755
--- a/turbojpeg-mapfile.jni
+++ b/turbojpeg-mapfile.jni
@@ -76,6 +76,9 @@ TURBOJPEG_1.4
 		tjDecompressToYUVPlanes;
 		tjEncodeYUV3;
 		tjEncodeYUVPlanes;
+		tjPlaneHeight;
+		tjPlaneSizeYUV;
+		tjPlaneWidth;
 		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
 		Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
 		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;

From c1afc7921d340408d8f152cdd3281ae3614fe36d Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 25 Nov 2014 09:48:54 +0000
Subject: [PATCH 021/108] Restore the JPP() and JMETHOD() macros.  Even though
 libjpeg-turbo doesn't use them anymore, other software apparently does:
 https://bugzilla.redhat.com/show_bug.cgi?id=1164815
 https://bugs.kde.org/show_bug.cgi?id=340944
 https://bugzilla.mozilla.org/show_bug.cgi?id=1093615

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1431 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt |  8 ++++++++
 jmorecfg.h    | 10 ++++++++++
 jpeglib.h     | 10 ++++++++++
 3 files changed, 28 insertions(+)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index ac6aad33..bdb1708f 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -52,6 +52,14 @@ were not actually usable on any platform except OS X and Windows, because
 those functions were not included in the libturbojpeg mapfile.  This has been
 fixed.
 
+[11] Restored the JPP() and JMETHOD() macros in the libjpeg-turbo header files.
+libjpeg-turbo no longer supports compilers that don't support prototype
+parameters, so the JPP() and JMETHOD() macros are no longer needed, but some
+software packages still use them to define their own prototypes.  A pretty
+good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/jmorecfg.h b/jmorecfg.h
index 36fe9714..0ee8f502 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -180,6 +180,16 @@ typedef unsigned int JDIMENSION;
 #define EXTERN(type)            extern type
 
 
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers as well as older compilers that did not support
+ * prototype parameters.  libjpeg-turbo no longer supports these older
+ * compilers, but the macro is still included because there is some software
+ * out there that uses it.
+ */
+
+#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+
+
 /*
  * On a few systems, type boolean and/or its values FALSE, TRUE may appear
  * in standard header files.  Or you may have conflicts with application-
diff --git a/jpeglib.h b/jpeglib.h
index 27045a42..b7089706 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -875,6 +875,16 @@ struct jpeg_memory_mgr {
 typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
 
 
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers as well as older compilers that did not support
+ * prototype parameters.  libjpeg-turbo no longer supports these older
+ * compilers, but the macro is still included because there is some software
+ * out there that uses it.
+ */
+
+#define JPP(arglist)    arglist
+
+
 /* Default error-management setup */
 EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr * err);
 

From de852420c0db014802405cae24751633e17c4bc6 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 25 Nov 2014 10:09:33 +0000
Subject: [PATCH 022/108] Some software also needs the FAR macro.  Ugh.  Also
 wordsmithing.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1433 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt | 11 +++++++----
 jmorecfg.h    | 14 +++++++++++---
 jpeglib.h     |  6 +++---
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index bdb1708f..40ae4814 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -52,10 +52,13 @@ were not actually usable on any platform except OS X and Windows, because
 those functions were not included in the libturbojpeg mapfile.  This has been
 fixed.
 
-[11] Restored the JPP() and JMETHOD() macros in the libjpeg-turbo header files.
-libjpeg-turbo no longer supports compilers that don't support prototype
-parameters, so the JPP() and JMETHOD() macros are no longer needed, but some
-software packages still use them to define their own prototypes.  A pretty
+[11] Restored the JPP(), JMETHOD(), and FAR macros in the libjpeg-turbo header
+files.  The JPP() and JMETHOD() macros were originally implemented in libjpeg
+as a way of supporting non-ANSI compilers that lacked support for prototype
+parameters.  libjpeg-turbo has never supported such compilers, but some
+software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the FAR macro.  A pretty
 good argument can be made that this is a bad practice on the part of the
 software in question, but since this affects more than one package, it's just
 easier to fix it here.
diff --git a/jmorecfg.h b/jmorecfg.h
index 0ee8f502..108e7de9 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -182,14 +182,22 @@ typedef unsigned int JDIMENSION;
 
 /* Originally, this macro was used as a way of defining function prototypes
  * for both modern compilers as well as older compilers that did not support
- * prototype parameters.  libjpeg-turbo no longer supports these older
- * compilers, but the macro is still included because there is some software
- * out there that uses it.
+ * prototype parameters.  libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, but the macro is still included because there is some
+ * software out there that uses it.
  */
 
 #define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
 
 
+/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
+ * but again, some software relies on this macro.
+ */
+
+#undef FAR
+#define FAR
+
+
 /*
  * On a few systems, type boolean and/or its values FALSE, TRUE may appear
  * in standard header files.  Or you may have conflicts with application-
diff --git a/jpeglib.h b/jpeglib.h
index b7089706..9615c5d0 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -877,9 +877,9 @@ typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
 
 /* Originally, this macro was used as a way of defining function prototypes
  * for both modern compilers as well as older compilers that did not support
- * prototype parameters.  libjpeg-turbo no longer supports these older
- * compilers, but the macro is still included because there is some software
- * out there that uses it.
+ * prototype parameters.  libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, but the macro is still included because there is some
+ * software out there that uses it.
  */
 
 #define JPP(arglist)    arglist

From 30ef0977beff87fbed6f24d9959a0912775ae3d4 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Thu, 27 Nov 2014 16:11:57 -0500
Subject: [PATCH 023/108] Clean up qtable code and change defaults

---
 cjpeg.c    |   4 +-
 jcparam.c  | 141 ++++++++++++++++++++++++++++++++++++-----------------
 rdswitch.c | 131 ++++++++++++++++++++++++++++++++++---------------
 3 files changed, 189 insertions(+), 87 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index edc128bc..a8197a58 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -534,14 +534,14 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-ms-ssim", 6)) {
-      jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, 2);
+      jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, 3);
       jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1, 12.0);
       jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2, 13.0);
       jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_LAMBDA_WEIGHT_TBL, TRUE);
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "tune-hvs-psnr", 6)) {
-      jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, 4);
+      jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, 3);
       jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE1, 14.75);
       jpeg_c_set_float_param(cinfo, JFLOAT_LAMBDA_LOG_SCALE2, 16.5);
       jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_LAMBDA_WEIGHT_TBL, TRUE);
diff --git a/jcparam.c b/jcparam.c
index 514f6cdd..b254b99f 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -70,9 +70,10 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
+static const unsigned int std_luminance_quant_tbl[9][DCTSIZE2] = {
   {
-    // JPEG Annex K
+    /* JPEG Annex K
+     */
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
     14,  13,  16,  24,  40,  57,  69,  56,
@@ -83,7 +84,8 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     72,  92,  95,  98, 112, 100, 103,  99
   },
   {
-    // Flat
+    /* flat
+     */
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
@@ -104,7 +106,8 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     68, 90, 90, 96, 113, 102, 105, 103
   },
   {
-    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    /* From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+     */
     16,  16,  16,  18,  25,  37,  56,  85,
     16,  17,  20,  27,  34,  40,  53,  75,
     16,  20,  24,  31,  43,  62,  91,  135,
@@ -125,30 +128,8 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     70, 75, 100, 102, 116, 100, 107, 98
   },
   {
-#if 0
-    // DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
-    7, 8, 10, 14, 23, 44, 95, 241,
-    8, 8, 11, 15, 25, 47, 102, 255,
-    10, 11, 13, 19, 31, 58, 127, 255,
-    14, 15, 19, 27, 44, 83, 181, 255,
-    23, 25, 31, 44, 72, 136, 255, 255,
-    44, 47, 58, 83, 136, 255, 255, 255,
-    95, 102, 127, 181, 255, 255, 255, 255,
-    241, 255, 255, 255, 255, 255, 255, 255
-#endif
-#if 0
-    // A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
-    15, 11, 11, 12, 15, 19, 25, 32,
-    11, 13, 10, 10, 12, 15, 19, 24,
-    11, 10, 14, 14, 16, 18, 22, 27,
-    12, 10, 14, 18, 21, 24, 28, 33,
-    15, 12, 16, 21, 26, 31, 36, 42,
-    19, 15, 18, 24, 31, 38, 45, 53,
-    25, 19, 22, 28, 36, 45, 55, 65,
-    32, 24, 27, 33, 42, 53, 65, 77
-#endif
-#if 1
-    // Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+    /* Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+     */
     10, 12, 14, 19, 26, 38, 57, 86,
     12, 18, 21, 28, 35, 41, 54, 76,
     14, 21, 25, 32, 44, 63, 92, 136,
@@ -157,9 +138,34 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     38, 41, 63, 75, 95, 125, 170, 239,
     57, 54, 92, 107, 132, 170, 227, 312,
     86, 76, 136, 157, 190, 239, 312, 419
-#endif
-#if 0
-    // An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+  },
+  {
+    /* DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+     */
+    7, 8, 10, 14, 23, 44, 95, 241,
+    8, 8, 11, 15, 25, 47, 102, 255,
+    10, 11, 13, 19, 31, 58, 127, 255,
+    14, 15, 19, 27, 44, 83, 181, 255,
+    23, 25, 31, 44, 72, 136, 255, 255,
+    44, 47, 58, 83, 136, 255, 255, 255,
+    95, 102, 127, 181, 255, 255, 255, 255,
+    241, 255, 255, 255, 255, 255, 255, 255
+  },
+  {
+    /* A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+     */
+    15, 11, 11, 12, 15, 19, 25, 32,
+    11, 13, 10, 10, 12, 15, 19, 24,
+    11, 10, 14, 14, 16, 18, 22, 27,
+    12, 10, 14, 18, 21, 24, 28, 33,
+    15, 12, 16, 21, 26, 31, 36, 42,
+    19, 15, 18, 24, 31, 38, 45, 53,
+    25, 19, 22, 28, 36, 45, 55, 65,
+    32, 24, 27, 33, 42, 53, 65, 77
+  },
+  {
+    /* An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+     */
     14, 10, 11, 14, 19, 25, 34, 45,
     10, 11, 11, 12, 15, 20, 26, 33,
     11, 11, 15, 18, 21, 25, 31, 38,
@@ -168,13 +174,13 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     25, 20, 25, 33, 43, 54, 64, 74,
     34, 26, 31, 39, 51, 64, 77, 91,
     45, 33, 38, 47, 59, 74, 91, 108
-#endif
   }
 };
 
-static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
+static const unsigned int std_chrominance_quant_tbl[9][DCTSIZE2] = {
   {
-    // JPEG Annex K
+    /* JPEG Annex K
+     */
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
     24,  26,  56,  99,  99,  99,  99,  99,
@@ -185,7 +191,8 @@ static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
     99,  99,  99,  99,  99,  99,  99,  99
   },
   {
-    // Flat
+    /* flat
+     */
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
@@ -227,15 +234,56 @@ static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
     97, 97, 99, 99, 99, 99, 97, 99
   },
   {
-    // JPEG Annex K
-    17,  18,  24,  47,  99,  99,  99,  99,
-    18,  21,  26,  66,  99,  99,  99,  99,
-    24,  26,  56,  99,  99,  99,  99,  99,
-    47,  66,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99
+    /* Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+     * Copied from luma
+     */
+    10, 12, 14, 19, 26, 38, 57, 86,
+    12, 18, 21, 28, 35, 41, 54, 76,
+    14, 21, 25, 32, 44, 63, 92, 136,
+    19, 28, 32, 41, 54, 75, 107, 157,
+    26, 35, 44, 54, 70, 95, 132, 190,
+    38, 41, 63, 75, 95, 125, 170, 239,
+    57, 54, 92, 107, 132, 170, 227, 312,
+    86, 76, 136, 157, 190, 239, 312, 419
+  },
+  {
+    /* DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+     * Copied from luma
+     */
+    7, 8, 10, 14, 23, 44, 95, 241,
+    8, 8, 11, 15, 25, 47, 102, 255,
+    10, 11, 13, 19, 31, 58, 127, 255,
+    14, 15, 19, 27, 44, 83, 181, 255,
+    23, 25, 31, 44, 72, 136, 255, 255,
+    44, 47, 58, 83, 136, 255, 255, 255,
+    95, 102, 127, 181, 255, 255, 255, 255,
+    241, 255, 255, 255, 255, 255, 255, 255
+  },
+  {
+    /* A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+     * Copied from luma
+     */
+    15, 11, 11, 12, 15, 19, 25, 32,
+    11, 13, 10, 10, 12, 15, 19, 24,
+    11, 10, 14, 14, 16, 18, 22, 27,
+    12, 10, 14, 18, 21, 24, 28, 33,
+    15, 12, 16, 21, 26, 31, 36, 42,
+    19, 15, 18, 24, 31, 38, 45, 53,
+    25, 19, 22, 28, 36, 45, 55, 65,
+    32, 24, 27, 33, 42, 53, 65, 77
+  },
+  {
+    /* An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+     * Copied from luma
+     */
+    14, 10, 11, 14, 19, 25, 34, 45,
+    10, 11, 11, 12, 15, 20, 26, 33,
+    11, 11, 15, 18, 21, 25, 31, 38,
+    14, 12, 18, 24, 28, 33, 39, 47,
+    19, 15, 21, 28, 36, 43, 51, 59,
+    25, 20, 25, 33, 43, 54, 64, 74,
+    34, 26, 31, 39, 51, 64, 77, 91,
+    45, 33, 38, 47, 59, 74, 91, 108
   }
 };
 
@@ -458,8 +506,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 #endif
   
   cinfo->master->trellis_quant = cinfo->master->use_moz_defaults;
-  cinfo->master->lambda_log_scale1 = 16.0;
-  cinfo->master->lambda_log_scale2 = 15.5;
+  cinfo->master->lambda_log_scale1 = 14.75;
+  cinfo->master->lambda_log_scale2 = 16.5;
+  cinfo->master->quant_tbl_master_idx = 3;
   
   cinfo->master->use_lambda_weight_tbl = TRUE;
   cinfo->master->use_scans_in_trellis = FALSE;
diff --git a/rdswitch.c b/rdswitch.c
index 51d67974..c53ddbbf 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -279,8 +279,10 @@ bogus:
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
+static const unsigned int std_luminance_quant_tbl[9][DCTSIZE2] = {
   {
+    /* JPEG Annex K
+     */
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
     14,  13,  16,  24,  40,  57,  69,  56,
@@ -291,6 +293,8 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     72,  92,  95,  98, 112, 100, 103,  99
   },
   {
+    /* flat
+     */
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
@@ -311,7 +315,8 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     68, 90, 90, 96, 113, 102, 105, 103
   },
   {
-    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    /* From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+     */
     16,  16,  16,  18,  25,  37,  56,  85,
     16,  17,  20,  27,  34,  40,  53,  75,
     16,  20,  24,  31,  43,  62,  91,  135,
@@ -332,30 +337,8 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     70, 75, 100, 102, 116, 100, 107, 98
   },
   {
-#if 0
-    // DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
-    7, 8, 10, 14, 23, 44, 95, 241,
-    8, 8, 11, 15, 25, 47, 102, 255,
-    10, 11, 13, 19, 31, 58, 127, 255,
-    14, 15, 19, 27, 44, 83, 181, 255,
-    23, 25, 31, 44, 72, 136, 255, 255,
-    44, 47, 58, 83, 136, 255, 255, 255,
-    95, 102, 127, 181, 255, 255, 255, 255,
-    241, 255, 255, 255, 255, 255, 255, 255
-#endif
-#if 0
-    // A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
-    15, 11, 11, 12, 15, 19, 25, 32,
-    11, 13, 10, 10, 12, 15, 19, 24,
-    11, 10, 14, 14, 16, 18, 22, 27,
-    12, 10, 14, 18, 21, 24, 28, 33,
-    15, 12, 16, 21, 26, 31, 36, 42,
-    19, 15, 18, 24, 31, 38, 45, 53,
-    25, 19, 22, 28, 36, 45, 55, 65,
-    32, 24, 27, 33, 42, 53, 65, 77
-#endif
-#if 1
-    // Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+    /* Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+     */
     10, 12, 14, 19, 26, 38, 57, 86,
     12, 18, 21, 28, 35, 41, 54, 76,
     14, 21, 25, 32, 44, 63, 92, 136,
@@ -364,9 +347,34 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     38, 41, 63, 75, 95, 125, 170, 239,
     57, 54, 92, 107, 132, 170, 227, 312,
     86, 76, 136, 157, 190, 239, 312, 419
-#endif
-#if 0
-    // An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+  },
+  {
+    /* DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+     */
+    7, 8, 10, 14, 23, 44, 95, 241,
+    8, 8, 11, 15, 25, 47, 102, 255,
+    10, 11, 13, 19, 31, 58, 127, 255,
+    14, 15, 19, 27, 44, 83, 181, 255,
+    23, 25, 31, 44, 72, 136, 255, 255,
+    44, 47, 58, 83, 136, 255, 255, 255,
+    95, 102, 127, 181, 255, 255, 255, 255,
+    241, 255, 255, 255, 255, 255, 255, 255
+  },
+  {
+    /* A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+     */
+    15, 11, 11, 12, 15, 19, 25, 32,
+    11, 13, 10, 10, 12, 15, 19, 24,
+    11, 10, 14, 14, 16, 18, 22, 27,
+    12, 10, 14, 18, 21, 24, 28, 33,
+    15, 12, 16, 21, 26, 31, 36, 42,
+    19, 15, 18, 24, 31, 38, 45, 53,
+    25, 19, 22, 28, 36, 45, 55, 65,
+    32, 24, 27, 33, 42, 53, 65, 77
+  },
+  {
+    /* An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+     */
     14, 10, 11, 14, 19, 25, 34, 45,
     10, 11, 11, 12, 15, 20, 26, 33,
     11, 11, 15, 18, 21, 25, 31, 38,
@@ -375,12 +383,13 @@ static const unsigned int std_luminance_quant_tbl[6][DCTSIZE2] = {
     25, 20, 25, 33, 43, 54, 64, 74,
     34, 26, 31, 39, 51, 64, 77, 91,
     45, 33, 38, 47, 59, 74, 91, 108
-#endif
   }
 };
 
-static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
+static const unsigned int std_chrominance_quant_tbl[9][DCTSIZE2] = {
   {
+    /* JPEG Annex K
+     */
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
     24,  26,  56,  99,  99,  99,  99,  99,
@@ -391,6 +400,8 @@ static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
     99,  99,  99,  99,  99,  99,  99,  99
   },
   {
+    /* flat
+     */
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
     16,  16,  16,  16,  16,  16,  16,  16,
@@ -432,14 +443,56 @@ static const unsigned int std_chrominance_quant_tbl[6][DCTSIZE2] = {
     97, 97, 99, 99, 99, 99, 97, 99
   },
   {
-    17,  18,  24,  47,  99,  99,  99,  99,
-    18,  21,  26,  66,  99,  99,  99,  99,
-    24,  26,  56,  99,  99,  99,  99,  99,
-    47,  66,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99    
+    /* Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney.
+     * Copied from luma
+     */
+    10, 12, 14, 19, 26, 38, 57, 86,
+    12, 18, 21, 28, 35, 41, 54, 76,
+    14, 21, 25, 32, 44, 63, 92, 136,
+    19, 28, 32, 41, 54, 75, 107, 157,
+    26, 35, 44, 54, 70, 95, 132, 190,
+    38, 41, 63, 75, 95, 125, 170, 239,
+    57, 54, 92, 107, 132, 170, 227, 312,
+    86, 76, 136, 157, 190, 239, 312, 419
+  },
+  {
+    /* DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+     * Copied from luma
+     */
+    7, 8, 10, 14, 23, 44, 95, 241,
+    8, 8, 11, 15, 25, 47, 102, 255,
+    10, 11, 13, 19, 31, 58, 127, 255,
+    14, 15, 19, 27, 44, 83, 181, 255,
+    23, 25, 31, 44, 72, 136, 255, 255,
+    44, 47, 58, 83, 136, 255, 255, 255,
+    95, 102, 127, 181, 255, 255, 255, 255,
+    241, 255, 255, 255, 255, 255, 255, 255
+  },
+  {
+    /* A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+     * Copied from luma
+     */
+    15, 11, 11, 12, 15, 19, 25, 32,
+    11, 13, 10, 10, 12, 15, 19, 24,
+    11, 10, 14, 14, 16, 18, 22, 27,
+    12, 10, 14, 18, 21, 24, 28, 33,
+    15, 12, 16, 21, 26, 31, 36, 42,
+    19, 15, 18, 24, 31, 38, 45, 53,
+    25, 19, 22, 28, 36, 45, 55, 65,
+    32, 24, 27, 33, 42, 53, 65, 77
+  },
+  {
+    /* An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+     * Copied from luma
+     */
+    14, 10, 11, 14, 19, 25, 34, 45,
+    10, 11, 11, 12, 15, 20, 26, 33,
+    11, 11, 15, 18, 21, 25, 31, 38,
+    14, 12, 18, 24, 28, 33, 39, 47,
+    19, 15, 21, 28, 36, 43, 51, 59,
+    25, 20, 25, 33, 43, 54, 64, 74,
+    34, 26, 31, 39, 51, 64, 77, 91,
+    45, 33, 38, 47, 59, 74, 91, 108
   }
 };
 

From 2a41c84487dbdaf0b429b5c7d4ad91c79289eaf5 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Fri, 28 Nov 2014 08:10:20 -0500
Subject: [PATCH 024/108] Fix C comment style

---
 jcparam.c  | 3 ++-
 rdswitch.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/jcparam.c b/jcparam.c
index 41db06fc..4a723a32 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -213,7 +213,8 @@ static const unsigned int std_chrominance_quant_tbl[9][DCTSIZE2] = {
     99, 99, 99, 99, 99, 99, 99, 99
   },
   {
-    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    /* From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+     */
     16,  16,  16,  18,  25,  37,  56,  85,
     16,  17,  20,  27,  34,  40,  53,  75,
     16,  20,  24,  31,  43,  62,  91,  135,
diff --git a/rdswitch.c b/rdswitch.c
index d6619591..d4c630e6 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -422,7 +422,8 @@ static const unsigned int std_chrominance_quant_tbl[9][DCTSIZE2] = {
     99, 99, 99, 99, 99, 99, 99, 99
   },
   {
-    // From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+    /* From http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+     */
     16,  16,  16,  18,  25,  37,  56,  85,
     16,  17,  20,  27,  34,  40,  53,  75,
     16,  20,  24,  31,  43,  62,  91,  135,

From 5dae26de7cae6f4171f18402fe9bc36f8ff718db Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Fri, 28 Nov 2014 15:06:30 -0500
Subject: [PATCH 025/108] Remove access to parameter that shouldn't be exposed

---
 jccompat.c | 6 ------
 jpegint.h  | 2 +-
 jpeglib.h  | 1 -
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/jccompat.c b/jccompat.c
index f6ddf530..989a1456 100644
--- a/jccompat.c
+++ b/jccompat.c
@@ -28,7 +28,6 @@ jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
   case JBOOLEAN_TRELLIS_EOB_OPT:
   case JBOOLEAN_USE_LAMBDA_WEIGHT_TBL:
   case JBOOLEAN_USE_SCANS_IN_TRELLIS:
-  case JBOOLEAN_TRELLIS_PASSES:
   case JBOOLEAN_TRELLIS_Q_OPT:
   case JBOOLEAN_OVERSHOOT_DERINGING:
     return TRUE;
@@ -70,9 +69,6 @@ jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param,
   case JBOOLEAN_USE_SCANS_IN_TRELLIS:
     cinfo->master->use_scans_in_trellis = value;
     break;
-  case JBOOLEAN_TRELLIS_PASSES:
-    cinfo->master->trellis_passes = value;
-    break;
   case JBOOLEAN_TRELLIS_Q_OPT:
     cinfo->master->trellis_q_opt = value;
     break;
@@ -107,8 +103,6 @@ jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
     return cinfo->master->use_lambda_weight_tbl;
   case JBOOLEAN_USE_SCANS_IN_TRELLIS:
     return cinfo->master->use_scans_in_trellis;
-  case JBOOLEAN_TRELLIS_PASSES:
-    return cinfo->master->trellis_passes;
   case JBOOLEAN_TRELLIS_Q_OPT:
     return cinfo->master->trellis_q_opt;
   case JBOOLEAN_OVERSHOOT_DERINGING:
diff --git a/jpegint.h b/jpegint.h
index 86d120f8..5e960d13 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -67,7 +67,7 @@ struct jpeg_comp_master {
   boolean trellis_eob_opt; /* TRUE=optimize for sequences of EOB */
   boolean use_lambda_weight_tbl; /* TRUE=use lambda weighting table */
   boolean use_scans_in_trellis; /* TRUE=use scans in trellis optimization */
-  boolean trellis_passes; /* TRUE=currently doing trellis-related passes */
+  boolean trellis_passes; /* TRUE=currently doing trellis-related passes [not exposed] */
   boolean trellis_q_opt; /* TRUE=optimize quant table in trellis loop */
   boolean overshoot_deringing; /* TRUE=preprocess input to reduce ringing of edges on white background */
 
diff --git a/jpeglib.h b/jpeglib.h
index 00f3de36..a9c2c9f3 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -283,7 +283,6 @@ typedef enum {
   JBOOLEAN_TRELLIS_EOB_OPT = 0xD7F73780, /* TRUE=optimize for sequences of EOB */
   JBOOLEAN_USE_LAMBDA_WEIGHT_TBL = 0x339DB65F, /* TRUE=use lambda weighting table */
   JBOOLEAN_USE_SCANS_IN_TRELLIS = 0xFD841435, /* TRUE=use scans in trellis optimization */
-  JBOOLEAN_TRELLIS_PASSES = 0x3FF8A439, /* TRUE=currently doing trellis-related passes */
   JBOOLEAN_TRELLIS_Q_OPT = 0xE12AE269, /* TRUE=optimize quant table in trellis loop */
   JBOOLEAN_OVERSHOOT_DERINGING = 0x3F4BBBF9 /* TRUE=preprocess input to reduce ringing of edges on white background */
 } J_BOOLEAN_PARAM;

From faa4c444533e49e7bf970789d5900e7173fa65fd Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sat, 29 Nov 2014 19:15:46 -0500
Subject: [PATCH 026/108] Use single parameter for DC scan opt mode

JBOOLEAN_ONE_DC_SCAN and JBOOLEAN_SEP_DC_SCAN are merged into a single
parameter, JINT_DC_SCAN_OPT_MODE.
The default behavior is changed to use one DC scan per component.
---
 cjpeg.c    | 19 +++++++++++--------
 jccompat.c | 18 ++++++------------
 jcmaster.c |  4 ++--
 jcparam.c  |  8 ++++----
 jpegint.h  |  3 +--
 jpeglib.h  |  5 ++---
 6 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index ed1cb6dc..1277cae0 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -178,8 +178,11 @@ usage (void)
 #endif
   fprintf(stderr, "  -revert        Revert to standard defaults (instead of mozjpeg defaults)\n");
   fprintf(stderr, "  -fastcrush     Disable progressive scan optimization\n");
-  fprintf(stderr, "  -opt-dc-scan   Optimize DC scans (may be incompatible with some JPEG decoders)\n");
-  fprintf(stderr, "  -split-dc-scan Use one DC scan per component (may be incompatible with some JPEG decoders?)\n");
+  fprintf(stderr, "  -dc-scan-opt   DC scan optimization mode\n");
+  fprintf(stderr, "                 - 0 One scan for all components\n");
+  fprintf(stderr, "                 - 1 One scan per component (default)\n");
+  fprintf(stderr, "                 - 2 Optimize between one scan for all components and one scan for 1st component\n");
+  fprintf(stderr, "                     plus one scan for remaining components\n");
   fprintf(stderr, "  -notrellis     Disable trellis optimization\n");
   fprintf(stderr, "  -trellis-dc    Enable trellis optimization of DC coefficients (default)\n");
   fprintf(stderr, "  -notrellis-dc  Disable trellis optimization of DC coefficients\n");
@@ -373,8 +376,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
-    } else if (keymatch(arg, "opt-dc-scan", 6)) {
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_ONE_DC_SCAN, FALSE);
+    } else if (keymatch(arg, "dc-scan-opt", 3)) {
+      if (++argn >= argc) {      /* advance to next argument */
+        fprintf(stderr, "%s: missing argument for dc-scan-opt\n", progname);
+        usage();
+      }
+      jpeg_c_set_int_param(cinfo, JINT_DC_SCAN_OPT_MODE, atoi(argv[argn]));
 
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
       /* Enable entropy parm optimization. */
@@ -505,10 +512,6 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
         usage();
       cinfo->smoothing_factor = val;
 
-    } else if (keymatch(arg, "split-dc-scans", 3)) {
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_ONE_DC_SCAN, FALSE);
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_SEP_DC_SCAN, TRUE);
-      
     } else if (keymatch(arg, "targa", 1)) {
       /* Input file is Targa format. */
       is_targa = TRUE;
diff --git a/jccompat.c b/jccompat.c
index 989a1456..d0a398b2 100644
--- a/jccompat.c
+++ b/jccompat.c
@@ -21,8 +21,6 @@ jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
   switch (param) {
   case JBOOLEAN_USE_MOZ_DEFAULTS:
   case JBOOLEAN_OPTIMIZE_SCANS:
-  case JBOOLEAN_ONE_DC_SCAN:
-  case JBOOLEAN_SEP_DC_SCAN:
   case JBOOLEAN_TRELLIS_QUANT:
   case JBOOLEAN_TRELLIS_QUANT_DC:
   case JBOOLEAN_TRELLIS_EOB_OPT:
@@ -48,12 +46,6 @@ jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param,
   case JBOOLEAN_OPTIMIZE_SCANS:
     cinfo->master->optimize_scans = value;
     break;
-  case JBOOLEAN_ONE_DC_SCAN:
-    cinfo->master->one_dc_scan = value;
-    break;
-  case JBOOLEAN_SEP_DC_SCAN:
-    cinfo->master->sep_dc_scan = value;
-    break;
   case JBOOLEAN_TRELLIS_QUANT:
     cinfo->master->trellis_quant = value;
     break;
@@ -89,10 +81,6 @@ jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
     return cinfo->master->use_moz_defaults;
   case JBOOLEAN_OPTIMIZE_SCANS:
     return cinfo->master->optimize_scans;
-  case JBOOLEAN_ONE_DC_SCAN:
-    return cinfo->master->one_dc_scan;
-  case JBOOLEAN_SEP_DC_SCAN:
-    return cinfo->master->sep_dc_scan;
   case JBOOLEAN_TRELLIS_QUANT:
     return cinfo->master->trellis_quant;
   case JBOOLEAN_TRELLIS_QUANT_DC:
@@ -167,6 +155,7 @@ jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param)
   case JINT_TRELLIS_FREQ_SPLIT:
   case JINT_TRELLIS_NUM_LOOPS:
   case JINT_BASE_QUANT_TBL_IDX:
+  case JINT_DC_SCAN_OPT_MODE:
     return TRUE;
   }
 
@@ -187,6 +176,9 @@ jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value)
   case JINT_BASE_QUANT_TBL_IDX:
     cinfo->master->quant_tbl_master_idx = value;
     break;
+  case JINT_DC_SCAN_OPT_MODE:
+    cinfo->master->dc_scan_opt_mode = value;
+    break;
   default:
     ERREXIT(cinfo, JERR_BAD_PARAM);
   }
@@ -203,6 +195,8 @@ jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
     return cinfo->master->trellis_num_loops;
   case JINT_BASE_QUANT_TBL_IDX:
     return cinfo->master->quant_tbl_master_idx;
+  case JINT_DC_SCAN_OPT_MODE:
+    return cinfo->master->dc_scan_opt_mode;
   default:
     ERREXIT(cinfo, JERR_BAD_PARAM);
   }
diff --git a/jcmaster.c b/jcmaster.c
index ceb4282c..16b509f5 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -779,10 +779,10 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
     copy_buffer(cinfo, 0);
 
     if (cinfo->num_scans > cinfo->master->num_scans_luma &&
-        !cinfo->master->one_dc_scan) {
+        cinfo->master->dc_scan_opt_mode != 0) {
       base_scan_idx = cinfo->master->num_scans_luma;
       
-      if (master->interleave_chroma_dc && !cinfo->master->sep_dc_scan)
+      if (master->interleave_chroma_dc && cinfo->master->dc_scan_opt_mode != 1)
         copy_buffer(cinfo, base_scan_idx);
       else {
         copy_buffer(cinfo, base_scan_idx+1);
diff --git a/jcparam.c b/jcparam.c
index 4a723a32..b6c491df 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -496,7 +496,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 
   jpeg_default_colorspace(cinfo);
   
-  cinfo->master->one_dc_scan = TRUE;
+  cinfo->master->dc_scan_opt_mode = 1;
   
 #ifdef C_PROGRESSIVE_SUPPORTED
   if (cinfo->master->use_moz_defaults) {
@@ -786,7 +786,7 @@ jpeg_search_progression (j_compress_ptr cinfo)
   /* last 4 done conditionally */
   
   /* luma DC by itself */
-  if (cinfo->master->one_dc_scan)
+  if (cinfo->master->dc_scan_opt_mode == 0)
     scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
   else
     scanptr = fill_dc_scans(scanptr, 1, 0, 0);
@@ -912,9 +912,9 @@ jpeg_simple_progression (j_compress_ptr cinfo)
     if (cinfo->master->use_moz_defaults == TRUE) {
       /* scan defined in jpeg_scan_rgb.txt in jpgcrush */
     /* Initial DC scan */
-      if (cinfo->master->one_dc_scan)
+      if (cinfo->master->dc_scan_opt_mode == 0)
         scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
-      else if (cinfo->master->sep_dc_scan) {
+      else if (cinfo->master->dc_scan_opt_mode == 1) {
         scanptr = fill_a_scan(scanptr, 0, 0, 0, 0, 0);
         scanptr = fill_a_scan(scanptr, 1, 0, 0, 0, 0);
         scanptr = fill_a_scan(scanptr, 2, 0, 0, 0, 0);
diff --git a/jpegint.h b/jpegint.h
index 5e960d13..49de57ed 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -60,8 +60,6 @@ struct jpeg_comp_master {
   /* Extension parameters */
   boolean use_moz_defaults; /* TRUE=use Mozilla defaults */
   boolean optimize_scans; /* TRUE=optimize progressive coding scans */
-  boolean one_dc_scan; /* TRUE=use a single DC scan interleaving all components */
-  boolean sep_dc_scan; /* TRUE=each DC scan is separate */
   boolean trellis_quant; /* TRUE=use trellis quantization */
   boolean trellis_quant_dc; /* TRUE=use trellis quant for DC coefficient */
   boolean trellis_eob_opt; /* TRUE=optimize for sequences of EOB */
@@ -74,6 +72,7 @@ struct jpeg_comp_master {
   double norm_src[NUM_QUANT_TBLS][DCTSIZE2];
   double norm_coef[NUM_QUANT_TBLS][DCTSIZE2];
 
+  int dc_scan_opt_mode; /* DC scan optimization mode */
   int quant_tbl_master_idx; /* Quantization table master index */
   int trellis_freq_split; /* splitting point for frequency in trellis quantization */
   int trellis_num_loops; /* number of trellis loops */
diff --git a/jpeglib.h b/jpeglib.h
index a9c2c9f3..1b95a36e 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -276,8 +276,6 @@ typedef enum {
 typedef enum {
   JBOOLEAN_USE_MOZ_DEFAULTS = 0xAE2F5D7F, /* TRUE=use Mozilla defaults */
   JBOOLEAN_OPTIMIZE_SCANS = 0x680C061E, /* TRUE=optimize progressive coding scans */
-  JBOOLEAN_ONE_DC_SCAN = 0x3DA6A269, /* TRUE=use a single DC scan interleaving all components */
-  JBOOLEAN_SEP_DC_SCAN = 0xE20DFA9F, /* TRUE=each DC scan is separate */
   JBOOLEAN_TRELLIS_QUANT = 0xC5122033, /* TRUE=use trellis quantization */
   JBOOLEAN_TRELLIS_QUANT_DC = 0x339D4C0C, /* TRUE=use trellis quant for DC coefficient */
   JBOOLEAN_TRELLIS_EOB_OPT = 0xD7F73780, /* TRUE=optimize for sequences of EOB */
@@ -299,7 +297,8 @@ typedef enum {
 typedef enum {
   JINT_TRELLIS_FREQ_SPLIT = 0x6FAFF127, /* splitting point for frequency in trellis quantization */
   JINT_TRELLIS_NUM_LOOPS = 0xB63EBF39, /* number of trellis loops */
-  JINT_BASE_QUANT_TBL_IDX = 0x44492AB1 /* base quantization table index */
+  JINT_BASE_QUANT_TBL_IDX = 0x44492AB1, /* base quantization table index */
+  JINT_DC_SCAN_OPT_MODE = 0x0BE7AD3C /* DC scan optimization mode */
 } J_INT_PARAM;
 
 

From 413ac73d98c16bbaebfd3279ca2e440d7da9149c Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sat, 29 Nov 2014 19:20:33 -0500
Subject: [PATCH 027/108] Add documentation for mozjpeg

A brief explanation is provided for the parameters introduced in
mozjpeg.
---
 README-mozilla.txt | 109 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/README-mozilla.txt b/README-mozilla.txt
index e69de29b..8adbb3bb 100644
--- a/README-mozilla.txt
+++ b/README-mozilla.txt
@@ -0,0 +1,109 @@
+Mozilla JPEG Encoder Project
+============================
+
+This project's goal is to reduce the size of JPEG files without reducing
+quality or compatibility with the vast majority of the world's deployed decoders.
+
+The idea is to reduce transfer times for JPEGs on the Web, thus reducing page load times.
+
+'mozjpeg' is not intended to be a general JPEG library replacement. It makes tradeoffs that
+are intended to benefit Web use cases and focuses solely on improving encoding. It is best
+used as part of a Web encoding workflow. For a general JPEG library (e.g. your system libjpeg),
+especially if you care about decoding, we recommend libjpeg-turbo.
+
+
+Data structures
+===============
+
+New parameters introduced by the Mozilla JPEG encoder are placed into the jpeg_comp_master data
+structure which his not directly accessible. Several functions are introduced to get and set these
+parameters:
+
+EXTERN(boolean) jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param);
+EXTERN(void) jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param, boolean value);
+EXTERN(boolean) jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param);
+
+EXTERN(boolean) jpeg_c_float_param_supported (j_compress_ptr cinfo, J_FLOAT_PARAM param);
+EXTERN(void) jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param, float value);
+EXTERN(float) jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param);
+
+EXTERN(boolean) jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param);
+EXTERN(void) jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value);
+EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
+
+
+Boolean parameters
+------------------
+
+* JBOOLEAN_USE_MOZ_DEFAULTS indicates whether the Mozilla default settings should be used. Otherwise
+  the behavior reverts to the default from libjpeg-turbo. Note that this parameter should be set
+  before calling jpeg_set_defaults(). By default this parameter is enabled.
+
+* JBOOLEAN_OPTIMIZE_SCANS indicates whether to optimize scan parameters. Parameter optimization is
+  done as in jpgcrush. By default this parameter is enabled.
+
+* JBOOLEAN_TRELLIS_QUANT indicates whether to apply trellis quantization. For each 8x8 block trellis
+  quantization determines the best trade-off between rate and distortion. By default this parameter
+  is enabled.
+
+* JBOOLEAN_TRELLIS_QUANT_DC indicates whether to apply trellis quantization to DC coefficients. By
+  default this parameter is enabled.
+
+* JBOOLEAN_TRELLIS_EOB_OPT indicates whether to optimize runs of zero blocks in trellis quantization.
+  This is applicable only when JBOOLEAN_USE_SCANS_IN_TRELLIS is enabled. By default this parameter
+  is disabled.
+
+* JBOOLEAN_USE_LAMBDA_WEIGHT_TBL has currently no effect.
+
+* JBOOLEAN_USE_SCANS_IN_TRELLIS indicates whether multiple scans are considered during trellis
+  quantization. By default this parameter is disabled.
+
+* JBOOLEAN_TRELLIS_Q_OPT indicates whether to optimize the quantization table after trellis quantization.
+  If enabled a revised quantization table is derived such as to minimize the reconstruction error
+  given the quantized coefficients. By default this parameter is disabled.
+
+* JBOOLEAN_OVERSHOOT_DERINGING indicates whether overshooting is applied to samples with extreme
+  values (e.g., 0 and 255 for 8-bit samples). Overshooting may reduce ringing artifacts from
+  compression, in particular in areas where black text appears on a white background. By default
+  this parameter is enabled.
+
+Floating-point parameters
+-------------------------
+
+* JFLOAT_LAMBDA_LOG_SCALE1 and JFLOAT_LAMBDA_LOG_SCALE2 determine the lambda value used in
+  trellis quantization. By default these parameters are set to 14.75 and 16.5. The lambda value
+  (Lagrance multiplier) in the R + lambda * D equation is derived from
+  lambda = 2^s1 / ((2^s2 + n) * q^2) where s1 and s2 are the values of JFLOAT_LAMBDA_LOG_SCALE1
+  and JFLOAT_LAMBDA_LOG_SCALE2, n is the average of the squared unquantized AC coefficients
+  within the current 8x8 block, and q is the quantization table entry associated with the
+  current coefficient frequency. If JFLOAT_LAMBDA_LOG_SCALE2 is 0, an alternate form is used that
+  does not rely on n: lambda = 2^(s1-12) / q^2.
+
+Integer parameters
+------------------
+
+* JINT_TRELLIS_FREQ_SPLIT determines the position within the zigzag scan at which the split between
+  scans is positioned in the context of trellis quantization. JBOOLEAN_USE_SCANS_IN_TRELLIS must
+  be enabled for this parameter to take effect. By default this parameter is set to value 8.
+
+* JINT_TRELLIS_NUM_LOOPS determines the number of trellis quantization passes. Huffman tables are
+  updated between passes. By default this parameter is set to value 1.
+
+* JINT_BASE_QUANT_TBL_IDX determines which quantization table set to use. Multiple sets are defined
+  as below. By default this parameter is set to value 3.
+  - 0  Tables from JPEG Annex K
+  - 1  Flat table
+  - 2  Table tuned for MSSIM on Kodak image set
+  - 3  Table from http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+  - 4  Table tuned for PSNR-HVS-M on Kodak image set
+  - 5  Table from Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney
+  - 6  Table from DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
+  - 7  Table from A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
+  - 8  Table from An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+
+* JINT_DC_SCAN_OPT_MODE determines the DC scan optimization mode. Modes are defined as below. By default
+  this parameter is set to value 1.
+  - 0  One scan for all components
+  - 1  One scan per component
+  - 2  Optimize between one scan for all components and one scan for 1st component plus one scan for
+       remaining components

From bbaa3afe234c5826f838b33769ae39eb702c20e0 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Mon, 1 Dec 2014 15:50:00 -0600
Subject: [PATCH 028/108] Wordsmith mozjpeg README file. This includes more
 descriptive text for the project summary (the same text that is in the
 package descriptions), a more thorough description of the libjpeg API
 extensibility framework, reformatting to improve readability (particularly on
 80-column terminals), and numerous grammar tweaks.

---
 README-mozilla.txt | 235 +++++++++++++++++++++++++++++----------------
 1 file changed, 152 insertions(+), 83 deletions(-)

diff --git a/README-mozilla.txt b/README-mozilla.txt
index 8adbb3bb..88978f12 100644
--- a/README-mozilla.txt
+++ b/README-mozilla.txt
@@ -1,109 +1,178 @@
 Mozilla JPEG Encoder Project
 ============================
 
-This project's goal is to reduce the size of JPEG files without reducing
-quality or compatibility with the vast majority of the world's deployed decoders.
+mozjpeg is a fork of libjpeg-turbo that aims to speed up load times of web
+pages by reducing the size (and, by extension, the transmission time) of JPEG
+files.  It accomplishes this by enabling optimized Huffman trees and
+progressive entropy coding by default in the JPEG compressor, as well as
+splitting the spectrum of DCT coefficients into separate scans and using
+Trellis quantisation.
 
-The idea is to reduce transfer times for JPEGs on the Web, thus reducing page load times.
-
-'mozjpeg' is not intended to be a general JPEG library replacement. It makes tradeoffs that
-are intended to benefit Web use cases and focuses solely on improving encoding. It is best
-used as part of a Web encoding workflow. For a general JPEG library (e.g. your system libjpeg),
-especially if you care about decoding, we recommend libjpeg-turbo.
+Although it is based on libjpeg-turbo, mozjpeg is not intended to be a
+general-purpose or high-performance JPEG library.  Its performance is highly
+"asymmetric".  That is, the JPEG files it generates require much more time to
+compress than to decompress.  When the default settings are used, mozjpeg is
+considerably slower than libjpeg-turbo or even libjpeg at compressing images.
+Thus, it is not generally suitable for real-time compression.  It is best used
+as part of a web encoding workflow.
 
 
-Data structures
-===============
+libjpeg API Extensibility Framework
+===================================
 
-New parameters introduced by the Mozilla JPEG encoder are placed into the jpeg_comp_master data
-structure which his not directly accessible. Several functions are introduced to get and set these
-parameters:
+mozjpeg's implementation of the libjpeg API includes an extensibility framework
+that allows new features to be added without modifying the transparent libjpeg
+compress/decompress structures (which would break backward ABI compatibility.)
+Extension parameters are placed into the opaque jpeg_comp_master structure, and
+a set of accessor functions and globally unique tokens allows for 
+getting/setting those parameters without directly accessing the structure.
 
-EXTERN(boolean) jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param);
-EXTERN(void) jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param, boolean value);
-EXTERN(boolean) jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param);
-
-EXTERN(boolean) jpeg_c_float_param_supported (j_compress_ptr cinfo, J_FLOAT_PARAM param);
-EXTERN(void) jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param, float value);
-EXTERN(float) jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param);
-
-EXTERN(boolean) jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param);
-EXTERN(void) jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value);
-EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
+Currently, only the accessor functions necessary to support the mozjpeg
+extensions are implemented, but the framework can be easily extended in the
+future to accommodate additional simple parameter types, complex or
+multi-valued parameters, or decompressor extensions.
 
 
-Boolean parameters
-------------------
+The currently-implemented accessor functions are as follows:
 
-* JBOOLEAN_USE_MOZ_DEFAULTS indicates whether the Mozilla default settings should be used. Otherwise
-  the behavior reverts to the default from libjpeg-turbo. Note that this parameter should be set
-  before calling jpeg_set_defaults(). By default this parameter is enabled.
+boolean jpeg_c_bool_param_supported (j_compress_ptr cinfo,
+                                     J_BOOLEAN_PARAM param)
+        Returns TRUE if the given boolean extension parameter is supported by
+        this implementation of the libjpeg API, or FALSE otherwise.
 
-* JBOOLEAN_OPTIMIZE_SCANS indicates whether to optimize scan parameters. Parameter optimization is
-  done as in jpgcrush. By default this parameter is enabled.
+void jpeg_c_set_bool_param (j_compress_ptr cinfo,
+                            J_BOOLEAN_PARAM param, boolean value);
+        Set the given boolean extension parameter to the given value (TRUE or
+        FALSE.)
 
-* JBOOLEAN_TRELLIS_QUANT indicates whether to apply trellis quantization. For each 8x8 block trellis
-  quantization determines the best trade-off between rate and distortion. By default this parameter
-  is enabled.
+boolean jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
+        Get the value of the given boolean extension parameter (TRUE or FALSE.)
 
-* JBOOLEAN_TRELLIS_QUANT_DC indicates whether to apply trellis quantization to DC coefficients. By
-  default this parameter is enabled.
+boolean jpeg_c_float_param_supported (j_compress_ptr cinfo,
+                                      J_FLOAT_PARAM param)
+        Returns TRUE if the given floating point extension parameter is
+        supported by this implementation of the libjpeg API, or FALSE
+        otherwise.
 
-* JBOOLEAN_TRELLIS_EOB_OPT indicates whether to optimize runs of zero blocks in trellis quantization.
-  This is applicable only when JBOOLEAN_USE_SCANS_IN_TRELLIS is enabled. By default this parameter
-  is disabled.
+void jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param,
+                             float value)
+        Set the given floating point extension parameter to the given value.
 
-* JBOOLEAN_USE_LAMBDA_WEIGHT_TBL has currently no effect.
+float jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param);
+        Get the value of the given floating point extension parameter.
 
-* JBOOLEAN_USE_SCANS_IN_TRELLIS indicates whether multiple scans are considered during trellis
-  quantization. By default this parameter is disabled.
+boolean jpeg_c_int_param_supported (j_compress_ptr cinfo,
+                                    J_INT_PARAM param)
+        Returns TRUE if the given integer extension parameter is supported by
+        this implementation of the libjpeg API, or FALSE otherwise.
 
-* JBOOLEAN_TRELLIS_Q_OPT indicates whether to optimize the quantization table after trellis quantization.
-  If enabled a revised quantization table is derived such as to minimize the reconstruction error
-  given the quantized coefficients. By default this parameter is disabled.
+void jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param,
+                          int value)
+        Set the given integer extension parameter to the given value.
 
-* JBOOLEAN_OVERSHOOT_DERINGING indicates whether overshooting is applied to samples with extreme
-  values (e.g., 0 and 255 for 8-bit samples). Overshooting may reduce ringing artifacts from
-  compression, in particular in areas where black text appears on a white background. By default
-  this parameter is enabled.
+int jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
+        Get the value of the given integer extension parameter.
 
-Floating-point parameters
--------------------------
 
-* JFLOAT_LAMBDA_LOG_SCALE1 and JFLOAT_LAMBDA_LOG_SCALE2 determine the lambda value used in
-  trellis quantization. By default these parameters are set to 14.75 and 16.5. The lambda value
-  (Lagrance multiplier) in the R + lambda * D equation is derived from
-  lambda = 2^s1 / ((2^s2 + n) * q^2) where s1 and s2 are the values of JFLOAT_LAMBDA_LOG_SCALE1
-  and JFLOAT_LAMBDA_LOG_SCALE2, n is the average of the squared unquantized AC coefficients
-  within the current 8x8 block, and q is the quantization table entry associated with the
-  current coefficient frequency. If JFLOAT_LAMBDA_LOG_SCALE2 is 0, an alternate form is used that
-  does not rely on n: lambda = 2^(s1-12) / q^2.
+Boolean Extension Parameters Supported by mozjpeg
+-------------------------------------------------
 
-Integer parameters
-------------------
+* JBOOLEAN_USE_MOZ_DEFAULTS (default: TRUE)
+  This parameter controls the behavior of the jpeg_set_defaults() function and
+  should thus be set prior to calling that function.  If this parameter is
+  TRUE, then jpeg_set_defaults() will configure the library to use the mozjpeg
+  defaults (which will enable settings that increase the compression ratio as
+  much as possible, at the expense of increased encoding time.)  If this
+  parameter is FALSE, then jpeg_set_defaults() will configure the library to
+  use the libjpeg[-turbo] defaults (baseline entropy coding, no mozjpeg
+  extensions enabled.)
 
-* JINT_TRELLIS_FREQ_SPLIT determines the position within the zigzag scan at which the split between
-  scans is positioned in the context of trellis quantization. JBOOLEAN_USE_SCANS_IN_TRELLIS must
-  be enabled for this parameter to take effect. By default this parameter is set to value 8.
+* JBOOLEAN_OPTIMIZE_SCANS (default: TRUE)
+  Specifies whether scan parameters should be optimized.  Parameter
+  optimization is done as in jpgcrush.
 
-* JINT_TRELLIS_NUM_LOOPS determines the number of trellis quantization passes. Huffman tables are
-  updated between passes. By default this parameter is set to value 1.
+* JBOOLEAN_TRELLIS_QUANT (default: TRUE)
+  Specifies whether to apply trellis quantization.  For each 8x8 block, trellis
+  quantization determines the best tradeoff between rate and distortion.
 
-* JINT_BASE_QUANT_TBL_IDX determines which quantization table set to use. Multiple sets are defined
-  as below. By default this parameter is set to value 3.
-  - 0  Tables from JPEG Annex K
-  - 1  Flat table
-  - 2  Table tuned for MSSIM on Kodak image set
-  - 3  Table from http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
-  - 4  Table tuned for PSNR-HVS-M on Kodak image set
-  - 5  Table from Relevance of human vision to JPEG-DCT compression (1992) Klein, Silverstein and Carney
-  - 6  Table from DCTune perceptual optimization of compressed dental X-Rays (1997) Watson, Taylor, Borthwick
-  - 7  Table from A visual detection model for DCT coefficient quantization (12/9/93) Ahumada, Watson, Peterson
-  - 8  Table from An improved detection model for DCT coefficient quantization (1993) Peterson, Ahumada and Watson
+* JBOOLEAN_TRELLIS_QUANT_DC (default: TRUE)
+  Specifies whether to apply trellis quantization to DC coefficients.
 
-* JINT_DC_SCAN_OPT_MODE determines the DC scan optimization mode. Modes are defined as below. By default
-  this parameter is set to value 1.
-  - 0  One scan for all components
-  - 1  One scan per component
-  - 2  Optimize between one scan for all components and one scan for 1st component plus one scan for
-       remaining components
+* JBOOLEAN_TRELLIS_EOB_OPT (default: FALSE)
+  Specifies whether to optimize runs of zero blocks in trellis quantization.
+  This is applicable only when JBOOLEAN_USE_SCANS_IN_TRELLIS is enabled.
+
+* JBOOLEAN_USE_LAMBDA_WEIGHT_TBL currently has no effect.
+
+* JBOOLEAN_USE_SCANS_IN_TRELLIS (default: FALSE)
+  Specifies whether multiple scans should be considered during trellis
+  quantization.
+
+* JBOOLEAN_TRELLIS_Q_OPT (default: FALSE)
+  Specifies whether to optimize the quantization table after trellis
+  quantization.  If enabled, then a revised quantization table is derived so
+  as to minimize the reconstruction error of the quantized coefficients.
+
+* JBOOLEAN_OVERSHOOT_DERINGING (default: TRUE)
+  Specifies whether overshooting is applied to samples with extreme values
+  (for example, 0 and 255 for 8-bit samples).  Overshooting may reduce ringing
+  artifacts from compression, in particular in areas where black text appears
+  on a white background.
+
+
+Floating Point Extension Parameters Supported by mozjpeg
+--------------------------------------------------------
+
+* JFLOAT_LAMBDA_LOG_SCALE1 (default: 14.75)
+  JFLOAT_LAMBDA_LOG_SCALE2 (default: 16.5)
+  These parameters specify the lambda value used in trellis quantization.  The
+  lambda value (Lagrange multiplier) in the
+    R + lambda * D
+  equation is derived from
+    lambda = 2^s1 / ((2^s2 + n) * q^2),
+  where s1 and s2 are the values of JFLOAT_LAMBDA_LOG_SCALE1 and
+  JFLOAT_LAMBDA_LOG_SCALE2, n is the average of the squared unquantized AC
+  coefficients within the current 8x8 block, and q is the quantization table
+  entry associated with the current coefficient frequency.  If
+  JFLOAT_LAMBDA_LOG_SCALE2 is 0, then an alternate form is used that does not
+  rely on n:
+    lambda = 2^(s1-12) / q^2.
+
+
+Integer Extension Parameters Supported by mozjpeg
+-------------------------------------------------
+
+* JINT_TRELLIS_FREQ_SPLIT (default: 8)
+  Specifies the position within the zigzag scan at which the split between
+  scans is positioned in the context of trellis quantization.
+  JBOOLEAN_USE_SCANS_IN_TRELLIS must be enabled for this parameter to have any
+  effect.
+
+* JINT_TRELLIS_NUM_LOOPS (default: 1)
+  Specifies the number of trellis quantization passes.  Huffman tables are
+  updated between passes.
+
+* JINT_BASE_QUANT_TBL_IDX (default: 3)
+  Specifies which quantization table set to use.  The following options are
+  available:
+  0 = Tables from JPEG Annex K
+  1 = Flat table
+  2 = Table tuned for MSSIM on Kodak image set
+  3 = Table from http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
+  4 = Table tuned for PSNR-HVS-M on Kodak image set
+  5 = Table from:  Relevance of Human Vision to JPEG-DCT Compression
+      (1992) Klein, Silverstein and Carney
+  6 = Table from:  DCTune Perceptual Optimization of Compressed Dental X-Rays
+      (1997) Watson, Taylor, Borthwick
+  7 = Table from:  A Visual Detection Model for DCT Coefficient Quantization
+      (12/9/93) Ahumada, Watson, Peterson
+  8 = Table from:  An Improved Detection Model for DCT Coefficient Quantization
+      (1993) Peterson, Ahumada and Watson
+
+* JINT_DC_SCAN_OPT_MODE (default: 1)
+  Specifies the DC scan optimization mode.  The following options are
+  available:
+  0 = One scan for all components
+  1 = One scan per component
+  2 = Optimize between one scan for all components and one scan for the first
+      component plus one scan for the remaining components

From aed7d4661e01a59db7d7fd632b2e45ef277523fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kornel=20Lesin=CC=81ski?= <kornel@geekhood.net>
Date: Fri, 5 Dec 2014 01:36:27 +0000
Subject: [PATCH 029/108] Fix build when INPUT_SMOOTHING_SUPPORTED is undefined

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1434 632fc199-4ca6-4c93-a231-07263d6284db
---
 jcsample.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/jcsample.c b/jcsample.c
index 286f7585..24d31ae1 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -514,8 +514,9 @@ jinit_downsampler (j_compress_ptr cinfo)
 #endif
           downsample->methods[ci] = h2v2_smooth_downsample;
         downsample->pub.need_context_rows = TRUE;
-      } else {
+      } else
 #endif
+      {
         if (jsimd_can_h2v2_downsample())
           downsample->methods[ci] = jsimd_h2v2_downsample;
         else

From 0cbef40560dd01a401ca5629fc7b857276837b1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kornel=20Lesin=CC=81ski?= <kornel@geekhood.net>
Date: Fri, 5 Dec 2014 01:36:34 +0000
Subject: [PATCH 030/108] Remove unused code

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1436 632fc199-4ca6-4c93-a231-07263d6284db
---
 jcmainct.c | 142 ++---------------------------------------------------
 1 file changed, 4 insertions(+), 138 deletions(-)

diff --git a/jcmainct.c b/jcmainct.c
index fc4014b1..6ca3768f 100644
--- a/jcmainct.c
+++ b/jcmainct.c
@@ -17,14 +17,6 @@
 #include "jpeglib.h"
 
 
-/* Note: currently, there is no operating mode in which a full-image buffer
- * is needed at this step.  If there were, that mode could not be used with
- * "raw data" input, since this module is bypassed in that case.  However,
- * we've left the code here for possible use in special applications.
- */
-#undef FULL_MAIN_BUFFER_SUPPORTED
-
-
 /* Private buffer controller object */
 
 typedef struct {
@@ -40,13 +32,6 @@ typedef struct {
    * points to the currently accessible strips of the virtual arrays.
    */
   JSAMPARRAY buffer[MAX_COMPONENTS];
-
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-  /* If using full-image storage, this array holds pointers to virtual-array
-   * control blocks for each component.  Unused if not full-image storage.
-   */
-  jvirt_sarray_ptr whole_image[MAX_COMPONENTS];
-#endif
 } my_main_controller;
 
 typedef my_main_controller * my_main_ptr;
@@ -56,11 +41,6 @@ typedef my_main_controller * my_main_ptr;
 METHODDEF(void) process_data_simple_main
         (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
          JDIMENSION in_rows_avail);
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-METHODDEF(void) process_data_buffer_main
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-         JDIMENSION in_rows_avail);
-#endif
 
 
 /*
@@ -76,32 +56,14 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
   if (cinfo->raw_data_in)
     return;
 
+  if (pass_mode != JBUF_PASS_THRU)
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+
   main_ptr->cur_iMCU_row = 0;   /* initialize counters */
   main_ptr->rowgroup_ctr = 0;
   main_ptr->suspended = FALSE;
   main_ptr->pass_mode = pass_mode;      /* save mode for use by process_data */
-
-  switch (pass_mode) {
-  case JBUF_PASS_THRU:
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-    if (main_ptr->whole_image[0] != NULL)
-      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-#endif
-    main_ptr->pub.process_data = process_data_simple_main;
-    break;
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-  case JBUF_SAVE_SOURCE:
-  case JBUF_CRANK_DEST:
-  case JBUF_SAVE_AND_PASS:
-    if (main_ptr->whole_image[0] == NULL)
-      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    main_ptr->pub.process_data = process_data_buffer_main;
-    break;
-#endif
-  default:
-    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    break;
-  }
+  main_ptr->pub.process_data = process_data_simple_main;
 }
 
 
@@ -160,85 +122,6 @@ process_data_simple_main (j_compress_ptr cinfo,
 }
 
 
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-
-/*
- * Process some data.
- * This routine handles all of the modes that use a full-size buffer.
- */
-
-METHODDEF(void)
-process_data_buffer_main (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                          JDIMENSION in_rows_avail)
-{
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
-  int ci;
-  jpeg_component_info *compptr;
-  boolean writing = (main_ptr->pass_mode != JBUF_CRANK_DEST);
-
-  while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
-    /* Realign the virtual buffers if at the start of an iMCU row. */
-    if (main_ptr->rowgroup_ctr == 0) {
-      for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-           ci++, compptr++) {
-        main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray)
-          ((j_common_ptr) cinfo, main_ptr->whole_image[ci],
-           main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
-           (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
-      }
-      /* In a read pass, pretend we just read some source data. */
-      if (! writing) {
-        *in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
-        main_ptr->rowgroup_ctr = DCTSIZE;
-      }
-    }
-
-    /* If a write pass, read input data until the current iMCU row is full. */
-    /* Note: preprocessor will pad if necessary to fill the last iMCU row. */
-    if (writing) {
-      (*cinfo->prep->pre_process_data) (cinfo,
-                                        input_buf, in_row_ctr, in_rows_avail,
-                                        main_ptr->buffer, &main_ptr->rowgroup_ctr,
-                                        (JDIMENSION) DCTSIZE);
-      /* Return to application if we need more data to fill the iMCU row. */
-      if (main_ptr->rowgroup_ctr < DCTSIZE)
-        return;
-    }
-
-    /* Emit data, unless this is a sink-only pass. */
-    if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) {
-      if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
-        /* If compressor did not consume the whole row, then we must need to
-         * suspend processing and return to the application.  In this situation
-         * we pretend we didn't yet consume the last input row; otherwise, if
-         * it happened to be the last row of the image, the application would
-         * think we were done.
-         */
-        if (! main_ptr->suspended) {
-          (*in_row_ctr)--;
-          main_ptr->suspended = TRUE;
-        }
-        return;
-      }
-      /* We did finish the row.  Undo our little suspension hack if a previous
-       * call suspended; then mark the main buffer empty.
-       */
-      if (main_ptr->suspended) {
-        (*in_row_ctr)++;
-        main_ptr->suspended = FALSE;
-      }
-    }
-
-    /* If get here, we are done with this iMCU row.  Mark buffer empty. */
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->cur_iMCU_row++;
-  }
-}
-
-#endif /* FULL_MAIN_BUFFER_SUPPORTED */
-
-
 /*
  * Initialize main buffer controller.
  */
@@ -264,25 +147,8 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
    * may be of a different size.
    */
   if (need_full_buffer) {
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-    /* Allocate a full-image virtual array for each component */
-    /* Note we pad the bottom to a multiple of the iMCU height */
-    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-         ci++, compptr++) {
-      main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-         compptr->width_in_blocks * DCTSIZE,
-         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-                                (long) compptr->v_samp_factor) * DCTSIZE,
-         (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
-    }
-#else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-#endif
   } else {
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-    main_ptr->whole_image[0] = NULL; /* flag for no virtual arrays */
-#endif
     /* Allocate a strip buffer for each component */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {

From 775862809bf499060d9490af76cf42e4dddb1e11 Mon Sep 17 00:00:00 2001
From: Tristan Matthews <tmatth@videolan.org>
Date: Sun, 7 Dec 2014 02:49:17 -0500
Subject: [PATCH 031/108] jpegyuv: fix unused but set warning

---
 jpegyuv.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/jpegyuv.c b/jpegyuv.c
index ebd8dd94..86c737d8 100644
--- a/jpegyuv.c
+++ b/jpegyuv.c
@@ -53,7 +53,6 @@ int main(int argc, char *argv[]) {
   int chroma_width;
   int chroma_height;
   int frame_width;
-  int frame_height;
   int yuv_size;
   JSAMPLE *image_buffer;
   JSAMPROW yrow_pointer[16];
@@ -110,7 +109,6 @@ int main(int argc, char *argv[]) {
   }
 
   frame_width = (cinfo.output_width + (16 - 1)) & ~(16 - 1);
-  frame_height = (cinfo.output_height + (16 - 1)) & ~(16 - 1);
 
   image_buffer = malloc(frame_width*16 + 2*(frame_width/2)*8);
   if (!image_buffer) {

From 00ace7679afd96641f1c06c048276eda4ff8bdcf Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Sun, 7 Dec 2014 10:15:45 -0600
Subject: [PATCH 032/108] Bump mozjpeg version to 3.0

---
 CMakeLists.txt | 2 +-
 configure.ac   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8095f351..b3e618f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@ if(POLICY CMP0022)
 endif()
 
 project(mozjpeg C)
-set(VERSION 2.1)
+set(VERSION 3.0)
 
 if(CYGWIN OR NOT CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
   execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
diff --git a/configure.ac b/configure.ac
index 3a079da5..8c511917 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([mozjpeg], [2.1])
+AC_INIT([mozjpeg], [3.0])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])

From 79929eae12236d56c769ec167e0cd88759de7bc4 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sun, 7 Dec 2014 16:54:48 -0400
Subject: [PATCH 033/108] Fix argument name mismatch #128

---
 cjpeg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cjpeg.c b/cjpeg.c
index 1277cae0..9efede99 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -447,7 +447,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
-    } else if (keymatch(arg, "quant_table", 2)) {
+    } else if (keymatch(arg, "quant-table", 2)) {
       if (++argn >= argc)       /* advance to next argument */
         usage();
       jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, atoi(argv[argn]));

From fa6d5a29da8b8b925280e97142bc8fc647fe6e06 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sun, 7 Dec 2014 17:08:20 -0400
Subject: [PATCH 034/108] Check quantization table index

Fixes #130
---
 cjpeg.c    | 8 +++++++-
 jccompat.c | 3 ++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index 9efede99..2054ae8e 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -448,9 +448,15 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       /* We postpone actually reading the file in case -quality comes later. */
 
     } else if (keymatch(arg, "quant-table", 2)) {
+      int val;
       if (++argn >= argc)       /* advance to next argument */
         usage();
-      jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, atoi(argv[argn]));
+      val = atoi(argv[argn]);
+      jpeg_c_set_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX, val);
+      if (jpeg_c_get_int_param(cinfo, JINT_BASE_QUANT_TBL_IDX) != val) {
+        fprintf(stderr, "%s: %d is invalid argument for quant-table\n", progname, val);
+        usage();
+      }
       jpeg_set_quality(cinfo, 75, TRUE);
       
     } else if (keymatch(arg, "restart", 1)) {
diff --git a/jccompat.c b/jccompat.c
index d0a398b2..6a7f1c33 100644
--- a/jccompat.c
+++ b/jccompat.c
@@ -174,7 +174,8 @@ jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value)
     cinfo->master->trellis_num_loops = value;
     break;
   case JINT_BASE_QUANT_TBL_IDX:
-    cinfo->master->quant_tbl_master_idx = value;
+    if (value >= 0 && value <= 8)
+      cinfo->master->quant_tbl_master_idx = value;
     break;
   case JINT_DC_SCAN_OPT_MODE:
     cinfo->master->dc_scan_opt_mode = value;

From 6d172465cc8e00d507fc73dbc7a0913f2fdc599c Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Mon, 8 Dec 2014 13:28:32 -0600
Subject: [PATCH 035/108] Drop executable bit for README-turbo.txt

---
 README-turbo.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 README-turbo.txt

diff --git a/README-turbo.txt b/README-turbo.txt
old mode 100755
new mode 100644

From 779f5622aa3662d5e07137f050e523c68bcc588e Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 8 Dec 2014 23:22:53 +0000
Subject: [PATCH 036/108] Fix typos in test names

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1438 632fc199-4ca6-4c93-a231-07263d6284db
---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46c6450e..2f8eaa42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -557,7 +557,7 @@ foreach(libtype shared static)
   add_test(djpeg${suffix}-gray-islow-rgb
     ${dir}djpeg${suffix} -dct int -rgb -outfile testout_gray_islow_rgb.ppm
       testout_gray_islow.jpg)
-  add_test(cjpeg${suffix}-gray-islow-rgb-cmp
+  add_test(djpeg${suffix}-gray-islow-rgb-cmp
     ${CMAKE_COMMAND} -DMD5=${MD5_PPM_GRAY_ISLOW_RGB}
       -DFILE=testout_gray_islow_rgb.ppm
       -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
@@ -637,10 +637,10 @@ foreach(libtype shared static)
   endif()
   if(WITH_ARITH_DEC)
     # CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
-    add_test(cjpeg${suffix}-420m-ifast-ari
+    add_test(djpeg${suffix}-420m-ifast-ari
       ${dir}djpeg${suffix} -fast -ppm -outfile testout_420m_ifast_ari.ppm
         ${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
-    add_test(cjpeg${suffix}-420m-ifast-ari-cmp
+    add_test(djpeg${suffix}-420m-ifast-ari-cmp
       ${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_IFAST_ARI}
         -DFILE=testout_420m_ifast_ari.ppm
         -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)

From c51334f4d0311e3d772557501c35cf535a90c2a8 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Tue, 9 Dec 2014 16:22:36 -0500
Subject: [PATCH 037/108] Add clarification re usage of scan optimization

---
 README-mozilla.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README-mozilla.txt b/README-mozilla.txt
index 88978f12..70efb5dc 100644
--- a/README-mozilla.txt
+++ b/README-mozilla.txt
@@ -89,7 +89,10 @@ Boolean Extension Parameters Supported by mozjpeg
 
 * JBOOLEAN_OPTIMIZE_SCANS (default: TRUE)
   Specifies whether scan parameters should be optimized.  Parameter
-  optimization is done as in jpgcrush.
+  optimization is done as in jpgcrush. jpeg_simple_progression() should be called
+  after setting JBOOLEAN_OPTIMIZE_SCANS.
+  When disabling JBOOLEAN_OPTIMIZE_SCANS, cinfo.scan_info should additionally be
+  set to NULL to disable use of the progressive coding mode, if so desired.
 
 * JBOOLEAN_TRELLIS_QUANT (default: TRUE)
   Specifies whether to apply trellis quantization.  For each 8x8 block, trellis

From f8dd3830eb2387585d4f759a23abda7286e4718e Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Tue, 9 Dec 2014 21:24:53 -0500
Subject: [PATCH 038/108] Fix trellis / no Huffman opt combination

Enabling trellis and disabling Huffman table optimization was causing a
"Bogus buffer control mode" error as reported in #111
---
 jcinit.c   |  2 +-
 jcmaster.c | 20 ++++++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/jcinit.c b/jcinit.c
index f5ecf5f5..e86be0c8 100644
--- a/jcinit.c
+++ b/jcinit.c
@@ -63,7 +63,7 @@ jinit_compress_master (j_compress_ptr cinfo)
   /* Need a full-image coefficient buffer in any multi-pass mode. */
   jinit_c_coef_controller(cinfo,
     (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding ||
-               cinfo->master->optimize_scans));
+               cinfo->master->optimize_scans || cinfo->master->trellis_quant));
   jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   jinit_marker_writer(cinfo);
diff --git a/jcmaster.c b/jcmaster.c
index 16b509f5..eb706a2b 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -510,12 +510,12 @@ prepare_for_pass (j_compress_ptr cinfo)
       (*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
     }
     (*cinfo->fdct->start_pass) (cinfo);
-    (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding);
+    (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding || cinfo->master->trellis_quant);
     (*cinfo->coef->start_pass) (cinfo,
                                 (master->total_passes > 1 ?
                                  JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
     (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU);
-    if (cinfo->optimize_coding) {
+    if (cinfo->optimize_coding || cinfo->master->trellis_quant) {
       /* No immediate data output; postpone writing frame/scan headers */
       master->pub.call_pass_startup = FALSE;
     } else {
@@ -883,7 +883,10 @@ finish_pass_master (j_compress_ptr cinfo)
     master->scan_number++;
     break;
   case trellis_pass:
-    master->pass_type = (cinfo->optimize_coding || master->pass_number < master->pass_number_scan_opt_base-1) ? huff_opt_pass : output_pass;
+    if (cinfo->optimize_coding)
+      master->pass_type = huff_opt_pass;
+    else
+      master->pass_type = (master->pass_number < master->pass_number_scan_opt_base-1) ? trellis_pass : output_pass;
       
     if ((master->pass_number + 1) %
         (cinfo->num_components * (cinfo->master->use_scans_in_trellis ? 4 : 2)) == 0 &&
@@ -969,9 +972,14 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
   
   master->pass_number_scan_opt_base = 0;
   if (cinfo->master->trellis_quant) {
-    master->pass_number_scan_opt_base =
-      ((cinfo->master->use_scans_in_trellis) ? 4 : 2) * cinfo->num_components *
-      cinfo->master->trellis_num_loops;
+    if (cinfo->optimize_coding)
+      master->pass_number_scan_opt_base =
+        ((cinfo->master->use_scans_in_trellis) ? 4 : 2) * cinfo->num_components *
+        cinfo->master->trellis_num_loops;
+    else
+      master->pass_number_scan_opt_base =
+        ((cinfo->master->use_scans_in_trellis) ? 2 : 1) * cinfo->num_components *
+        cinfo->master->trellis_num_loops + 1;
     master->total_passes += master->pass_number_scan_opt_base;
   }
   

From 3e2cf6909c295d6dce5b8d3fd0612d2788a0e55a Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Sun, 14 Dec 2014 01:56:26 -0600
Subject: [PATCH 039/108] Convert JBOOLEAN_USE_MOZ_DEFAULTS into an integer
 "compression profile" parameter

This eliminates JBOOLEAN_USE_MOZ_DEFAULTS and replaces it with
JINT_COMPRESS_PROFILE, a more flexible and descriptive parameter.  Currently,
this new parameter works in much the same way as the old-- it changes the
behavior of jpeg_set_defaults().  It currently supports only two values (max.
compression, i.e. mozjpeg defaults, and fastest, i.e. libjpeg-turbo defaults),
but it can be extended in the future with additional profiles that balance
compression ratio with performance.

---
 README-mozilla.txt | 26 ++++++++++++++++----------
 cjpeg.c            |  2 +-
 jcapimin.c         |  2 +-
 jccompat.c         | 19 +++++++++++++------
 jcparam.c          | 26 +++++++++-----------------
 jerror.h           |  1 +
 jpegint.h          |  2 +-
 jpeglib.h          | 10 +++++++++-
 jpegtran.c         | 20 ++++++++++++--------
 turbojpeg.c        |  2 +-
 10 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/README-mozilla.txt b/README-mozilla.txt
index 70efb5dc..ebb86e6e 100644
--- a/README-mozilla.txt
+++ b/README-mozilla.txt
@@ -77,16 +77,6 @@ int jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
 Boolean Extension Parameters Supported by mozjpeg
 -------------------------------------------------
 
-* JBOOLEAN_USE_MOZ_DEFAULTS (default: TRUE)
-  This parameter controls the behavior of the jpeg_set_defaults() function and
-  should thus be set prior to calling that function.  If this parameter is
-  TRUE, then jpeg_set_defaults() will configure the library to use the mozjpeg
-  defaults (which will enable settings that increase the compression ratio as
-  much as possible, at the expense of increased encoding time.)  If this
-  parameter is FALSE, then jpeg_set_defaults() will configure the library to
-  use the libjpeg[-turbo] defaults (baseline entropy coding, no mozjpeg
-  extensions enabled.)
-
 * JBOOLEAN_OPTIMIZE_SCANS (default: TRUE)
   Specifies whether scan parameters should be optimized.  Parameter
   optimization is done as in jpgcrush. jpeg_simple_progression() should be called
@@ -145,6 +135,22 @@ Floating Point Extension Parameters Supported by mozjpeg
 Integer Extension Parameters Supported by mozjpeg
 -------------------------------------------------
 
+* JINT_COMPRESS_PROFILE (default: JCP_MAX_COMPRESSION)
+  Select a compression profile, which is a set of default parameters that will
+  achieve a desired compression goal.  This parameter controls the behavior of
+  the jpeg_set_defaults() function.  Thus, setting JINT_COMPRESS_PROFILE does
+  not cause any other parameters to be modified until jpeg_set_defaults() is
+  called.  The following compression profiles are supported:
+
+  - JCP_MAX_COMPRESSION (default)
+    Increase the compression ratio as much as possible, at the expense of
+    increased encoding time.  This enables progressive entropy coding and all
+    mozjpeg extensions.
+
+  - JCP_FASTEST
+    Use the libjpeg[-turbo] defaults (baseline entropy coding, no mozjpeg
+    extensions enabled.)
+
 * JINT_TRELLIS_FREQ_SPLIT (default: 8)
   Specifies the position within the zigzag scan at which the split between
   scans is positioned in the context of trellis quantization.
diff --git a/cjpeg.c b/cjpeg.c
index 2054ae8e..41558a79 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -480,7 +480,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "revert", 3)) {
       /* revert to old JPEG default */
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, FALSE);
+      jpeg_c_set_int_param(cinfo, JINT_COMPRESS_PROFILE, JCP_FASTEST);
       jpeg_set_defaults(cinfo);
 
     } else if (keymatch(arg, "sample", 2)) {
diff --git a/jcapimin.c b/jcapimin.c
index 8935546b..804e347f 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -99,7 +99,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
   cinfo->master = (struct jpeg_comp_master *)
       jpeg_get_small ((j_common_ptr) cinfo, sizeof(struct jpeg_comp_master));
   MEMZERO(cinfo->master, sizeof(struct jpeg_comp_master));
-  cinfo->master->use_moz_defaults = TRUE;
+  cinfo->master->compress_profile = JCP_MAX_COMPRESSION;
 }
 
 
diff --git a/jccompat.c b/jccompat.c
index 6a7f1c33..30ec3a4f 100644
--- a/jccompat.c
+++ b/jccompat.c
@@ -19,7 +19,6 @@ GLOBAL(boolean)
 jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
 {
   switch (param) {
-  case JBOOLEAN_USE_MOZ_DEFAULTS:
   case JBOOLEAN_OPTIMIZE_SCANS:
   case JBOOLEAN_TRELLIS_QUANT:
   case JBOOLEAN_TRELLIS_QUANT_DC:
@@ -40,9 +39,6 @@ jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param,
                        boolean value)
 {
   switch(param) {
-  case JBOOLEAN_USE_MOZ_DEFAULTS:
-    cinfo->master->use_moz_defaults = value;
-    break;
   case JBOOLEAN_OPTIMIZE_SCANS:
     cinfo->master->optimize_scans = value;
     break;
@@ -77,8 +73,6 @@ GLOBAL(boolean)
 jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
 {
   switch(param) {
-  case JBOOLEAN_USE_MOZ_DEFAULTS:
-    return cinfo->master->use_moz_defaults;
   case JBOOLEAN_OPTIMIZE_SCANS:
     return cinfo->master->optimize_scans;
   case JBOOLEAN_TRELLIS_QUANT:
@@ -152,6 +146,7 @@ GLOBAL(boolean)
 jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param)
 {
   switch (param) {
+  case JINT_COMPRESS_PROFILE:
   case JINT_TRELLIS_FREQ_SPLIT:
   case JINT_TRELLIS_NUM_LOOPS:
   case JINT_BASE_QUANT_TBL_IDX:
@@ -167,6 +162,16 @@ GLOBAL(void)
 jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value)
 {
   switch (param) {
+  case JINT_COMPRESS_PROFILE:
+    switch (value) {
+    case JCP_MAX_COMPRESSION:
+    case JCP_FASTEST:
+      cinfo->master->compress_profile = value;
+      break;
+    default:
+      ERREXIT(cinfo, JERR_BAD_PARAM_VALUE);
+    }
+    break;
   case JINT_TRELLIS_FREQ_SPLIT:
     cinfo->master->trellis_freq_split = value;
     break;
@@ -190,6 +195,8 @@ GLOBAL(int)
 jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
 {
   switch (param) {
+  case JINT_COMPRESS_PROFILE:
+    return cinfo->master->compress_profile;
   case JINT_TRELLIS_FREQ_SPLIT:
     return cinfo->master->trellis_freq_split;
   case JINT_TRELLIS_NUM_LOOPS:
diff --git a/jcparam.c b/jcparam.c
index b6c491df..9b68761b 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -417,19 +417,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
     cinfo->arith_ac_K[i] = 5;
   }
 
-#ifdef C_PROGRESSIVE_SUPPORTED
-  cinfo->scan_info = NULL;
-  cinfo->num_scans = 0;
-  if (!cinfo->master->use_moz_defaults) {
-    /* Default is no multiple-scan output */
-    cinfo->scan_info = NULL;
-    cinfo->num_scans = 0;
-  }
-#else
   /* Default is no multiple-scan output */
   cinfo->scan_info = NULL;
   cinfo->num_scans = 0;
-#endif
 
   /* Expect normal source image, not raw downsampled data */
   cinfo->raw_data_in = FALSE;
@@ -438,7 +428,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->arith_code = FALSE;
 
 #ifdef ENTROPY_OPT_SUPPORTED
-  if (cinfo->master->use_moz_defaults)
+  if (cinfo->master->compress_profile == JCP_MAX_COMPRESSION)
     /* By default, do extra passes to optimize entropy coding */
     cinfo->optimize_coding = TRUE;
   else
@@ -465,7 +455,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->do_fancy_downsampling = TRUE;
 #endif
 
-  cinfo->master->overshoot_deringing = cinfo->master->use_moz_defaults;
+  cinfo->master->overshoot_deringing =
+    cinfo->master->compress_profile == JCP_MAX_COMPRESSION;
 
   /* No input smoothing */
   cinfo->smoothing_factor = 0;
@@ -499,14 +490,15 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->master->dc_scan_opt_mode = 1;
   
 #ifdef C_PROGRESSIVE_SUPPORTED
-  if (cinfo->master->use_moz_defaults) {
+  if (cinfo->master->compress_profile == JCP_MAX_COMPRESSION) {
     cinfo->master->optimize_scans = TRUE;
     jpeg_simple_progression(cinfo);
   } else
     cinfo->master->optimize_scans = FALSE;
 #endif
   
-  cinfo->master->trellis_quant = cinfo->master->use_moz_defaults;
+  cinfo->master->trellis_quant =
+    cinfo->master->compress_profile == JCP_MAX_COMPRESSION;
   cinfo->master->lambda_log_scale1 = 14.75;
   cinfo->master->lambda_log_scale2 = 16.5;
   cinfo->master->quant_tbl_master_idx = 3;
@@ -877,7 +869,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
     nscans = 10;
   } else {
     /* All-purpose script for other color spaces. */
-    if (cinfo->master->use_moz_defaults == TRUE) {
+    if (cinfo->master->compress_profile == JCP_MAX_COMPRESSION) {
       if (ncomps > MAX_COMPS_IN_SCAN)
         nscans = 5 * ncomps;      /* 2 DC + 4 AC scans per component */
       else
@@ -909,7 +901,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
 
   if (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr) {
     /* Custom script for YCbCr color images. */
-    if (cinfo->master->use_moz_defaults == TRUE) {
+    if (cinfo->master->compress_profile == JCP_MAX_COMPRESSION) {
       /* scan defined in jpeg_scan_rgb.txt in jpgcrush */
     /* Initial DC scan */
       if (cinfo->master->dc_scan_opt_mode == 0)
@@ -957,7 +949,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
     }
   } else {
     /* All-purpose script for other color spaces. */
-    if (cinfo->master->use_moz_defaults == TRUE) {
+    if (cinfo->master->compress_profile == JCP_MAX_COMPRESSION) {
       /* scan defined in jpeg_scan_bw.txt in jpgcrush */
       /* DC component, no successive approximation */
       scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
diff --git a/jerror.h b/jerror.h
index 816fdf9a..cb5d6f84 100644
--- a/jerror.h
+++ b/jerror.h
@@ -208,6 +208,7 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 #endif
 JMESSAGE(JERR_BAD_PARAM, "Bogus parameter")
+JMESSAGE(JERR_BAD_PARAM_VALUE, "Bogus parameter value")
 
 #ifdef JMAKE_ENUM_LIST
 
diff --git a/jpegint.h b/jpegint.h
index 49de57ed..36c5e409 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -58,7 +58,6 @@ struct jpeg_comp_master {
   boolean is_last_pass;         /* True during last pass */
 
   /* Extension parameters */
-  boolean use_moz_defaults; /* TRUE=use Mozilla defaults */
   boolean optimize_scans; /* TRUE=optimize progressive coding scans */
   boolean trellis_quant; /* TRUE=use trellis quantization */
   boolean trellis_quant_dc; /* TRUE=use trellis quant for DC coefficient */
@@ -72,6 +71,7 @@ struct jpeg_comp_master {
   double norm_src[NUM_QUANT_TBLS][DCTSIZE2];
   double norm_coef[NUM_QUANT_TBLS][DCTSIZE2];
 
+  int compress_profile; /* compression profile */
   int dc_scan_opt_mode; /* DC scan optimization mode */
   int quant_tbl_master_idx; /* Quantization table master index */
   int trellis_freq_split; /* splitting point for frequency in trellis quantization */
diff --git a/jpeglib.h b/jpeglib.h
index 1b95a36e..be895a96 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -274,7 +274,6 @@ typedef enum {
 /* Boolean extension parameters */
 
 typedef enum {
-  JBOOLEAN_USE_MOZ_DEFAULTS = 0xAE2F5D7F, /* TRUE=use Mozilla defaults */
   JBOOLEAN_OPTIMIZE_SCANS = 0x680C061E, /* TRUE=optimize progressive coding scans */
   JBOOLEAN_TRELLIS_QUANT = 0xC5122033, /* TRUE=use trellis quantization */
   JBOOLEAN_TRELLIS_QUANT_DC = 0x339D4C0C, /* TRUE=use trellis quant for DC coefficient */
@@ -295,6 +294,7 @@ typedef enum {
 /* Integer parameters */
 
 typedef enum {
+  JINT_COMPRESS_PROFILE = 0xE9918625, /* compression profile */
   JINT_TRELLIS_FREQ_SPLIT = 0x6FAFF127, /* splitting point for frequency in trellis quantization */
   JINT_TRELLIS_NUM_LOOPS = 0xB63EBF39, /* number of trellis loops */
   JINT_BASE_QUANT_TBL_IDX = 0x44492AB1, /* base quantization table index */
@@ -302,6 +302,14 @@ typedef enum {
 } J_INT_PARAM;
 
 
+/* Values for the JINT_COMPRESS_PROFILE parameter (32-bit GUIDs) */
+
+enum {
+  JCP_MAX_COMPRESSION = 0x5D083AAD, /* best compression ratio (progressive, all mozjpeg extensions) */
+  JCP_FASTEST = 0x2AEA5CB4 /* libjpeg[-turbo] defaults (baseline, no mozjpeg extensions) */
+};
+
+
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
diff --git a/jpegtran.c b/jpegtran.c
index 782c4732..e5161c4b 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -44,7 +44,7 @@ static const char * progname;   /* program name for error messages */
 static char * outfilename;      /* for -outfile switch */
 static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
-boolean memsrc;  /* for -memsrc switch */
+boolean memsrc = FALSE;  /* for -memsrc switch */
 #define INPUT_BUF_SIZE  4096
 
 
@@ -316,7 +316,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
     } else if (keymatch(arg, "revert", 3)) {
       /* revert to old JPEG default */
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_USE_MOZ_DEFAULTS, FALSE);
+      jpeg_c_set_int_param(cinfo, JINT_COMPRESS_PROFILE, JCP_FASTEST);
       
     } else if (keymatch(arg, "rotate", 2)) {
       /* Rotate 90, 180, or 270 degrees (measured clockwise). */
@@ -474,8 +474,10 @@ main (int argc, char **argv)
 #endif
 
   /* Specify data source for decompression */
-  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
-    memsrc = jpeg_c_get_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS); /* needed to revert to original */
+  if (jpeg_c_int_param_supported(&dstinfo, JINT_COMPRESS_PROFILE) &&
+      jpeg_c_get_int_param(&dstinfo, JINT_COMPRESS_PROFILE)
+        == JCP_MAX_COMPRESSION)
+    memsrc = TRUE; /* needed to revert to original */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   if (memsrc) {
     size_t nbytes;
@@ -561,8 +563,9 @@ main (int argc, char **argv)
 
   /* Specify data destination for compression */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
-  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS) &&
-      jpeg_c_get_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS))
+  if (jpeg_c_int_param_supported(&dstinfo, JINT_COMPRESS_PROFILE) &&
+      jpeg_c_get_int_param(&dstinfo, JINT_COMPRESS_PROFILE)
+        == JCP_MAX_COMPRESSION)
     jpeg_mem_dest(&dstinfo, &outbuffer, &outsize);
   else
 #endif
@@ -584,8 +587,9 @@ main (int argc, char **argv)
   /* Finish compression and release memory */
   jpeg_finish_compress(&dstinfo);
   
-  if (jpeg_c_bool_param_supported(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS) &&
-      jpeg_c_get_bool_param(&dstinfo, JBOOLEAN_USE_MOZ_DEFAULTS)) {
+  if (jpeg_c_int_param_supported(&dstinfo, JINT_COMPRESS_PROFILE) &&
+      jpeg_c_get_int_param(&dstinfo, JINT_COMPRESS_PROFILE)
+        == JCP_MAX_COMPRESSION) {
     size_t nbytes;
     
     unsigned char *buffer = outbuffer;
diff --git a/turbojpeg.c b/turbojpeg.c
index 7617ef8b..4d625c8c 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -240,7 +240,7 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo,
 	else jpeg_set_colorspace(cinfo, JCS_YCbCr);
 
 	/* Set scan pattern again as colorspace might have changed */
-	if (cinfo->master->use_moz_defaults)
+	if(cinfo->master->compress_profile == JCP_MAX_COMPRESSION)
 		jpeg_simple_progression(cinfo);
 
 	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;

From 668bb847d59d146bd0aed6dcf2ef3f33d9987d44 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Sun, 14 Dec 2014 02:02:26 -0600
Subject: [PATCH 040/108] Rename jccompat.c to jcext.c. "jcext" is a bit more
 descriptive, since this code is primarily intended to extend the libjpeg API.
  It does so in a backward-ABI-compatible manner, but "jccompat" could be
 misinterpreted to mean that the code is providing backward compatibility at
 the code level.

---
 CMakeLists.txt        | 4 ++--
 Makefile.am           | 2 +-
 jccompat.c => jcext.c | 0
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename jccompat.c => jcext.c (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3e618f1..143fd370 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,8 +179,8 @@ endif()
 # Targets
 #
 
-set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
-  jccompat.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
+set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jcext.c
+  jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
   jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
   jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c
   jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c
diff --git a/Makefile.am b/Makefile.am
index 3b99865b..8a9a4d4b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,7 +16,7 @@ HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
 	jpeg_nbits_table.h
 
 libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
-	jccompat.c jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c \
+	jcdctmgr.c jcext.c jchuff.c jcinit.c jcmainct.c jcmarker.c \
 	jcmaster.c jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c \
 	jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c \
 	jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c \
diff --git a/jccompat.c b/jcext.c
similarity index 100%
rename from jccompat.c
rename to jcext.c

From f645002fb68351b9711cf4badcff5d6deb525638 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Sun, 14 Dec 2014 03:59:44 -0600
Subject: [PATCH 041/108] Fix double free of cinfo->master caused by the
 extension framework modifications. There was an oversight in the extension
 framework.  jpeg_start_compress() can be called multiple times between the
 time that a compress structure is created and the time it is destroyed.  If
 this happened, then the following sequence would occur:

-- heap alloc of master struct within jpeg_create_compress()
-- heap free of master struct within jinit_c_master_control()
-- static alloc of extended master struct (JPOOL_IMAGE) within
   jinit_c_master_control()
-- free extended master struct in jpeg_finish_compress()
-- jinit_c_master_control() now sees that cinfo->master is set and tries to
   free it, even though it has already been freed.  Chaos ensues.

The fix involved breaking out the extended master struct into a header so that
jpeg_create_compress() can go ahead and allocate it to the correct size, thus
eliminating the need to free and reallocate it in jinit_c_master_control().
Further, the master struct is now created in the permanent pool, so it will
survive until the compression struct is destroyed.  Further,
jinit_c_master_control() now resets all fields in the master struct that
are not related to the extension parameters.
---
 jcapimin.c |  7 +++++--
 jcmaster.c | 49 +++----------------------------------------------
 jcmaster.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 48 deletions(-)
 create mode 100644 jcmaster.h

diff --git a/jcapimin.c b/jcapimin.c
index 804e347f..f5fa4167 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -23,6 +23,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jmemsys.h"
+#include "jcmaster.h"
 
 
 /*
@@ -97,8 +98,10 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
    * here.  It is later reallocated by jinit_c_master_control().
    */
   cinfo->master = (struct jpeg_comp_master *)
-      jpeg_get_small ((j_common_ptr) cinfo, sizeof(struct jpeg_comp_master));
-  MEMZERO(cinfo->master, sizeof(struct jpeg_comp_master));
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_comp_master));
+  MEMZERO(cinfo->master, sizeof(my_comp_master));
+
   cinfo->master->compress_profile = JCP_MAX_COMPRESSION;
 }
 
diff --git a/jcmaster.c b/jcmaster.c
index eb706a2b..c02f5903 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -21,42 +21,7 @@
 #include "jpeglib.h"
 #include "jpegcomp.h"
 #include "jmemsys.h"
-
-
-/* Private state */
-
-typedef enum {
-        main_pass,              /* input data, also do first output step */
-        huff_opt_pass,          /* Huffman code optimization pass */
-        output_pass,            /* data output pass */
-        trellis_pass            /* trellis quantization pass */
-} c_pass_type;
-
-typedef struct {
-  struct jpeg_comp_master pub;  /* public fields */
-
-  c_pass_type pass_type;        /* the type of the current pass */
-
-  int pass_number;              /* # of passes completed */
-  int total_passes;             /* total # of passes needed */
-
-  int scan_number;              /* current index in scan_info[] */
-  
-  /* fields for scan optimisation */
-  int pass_number_scan_opt_base; /* pass number where scan optimization begins */
-  unsigned char * scan_buffer[64]; /* buffer for a given scan */
-  unsigned long scan_size[64]; /* size for a given scan */
-  int actual_Al[64]; /* actual value of Al used for a scan */
-  unsigned long best_cost; /* bit count for best frequency split */
-  int best_freq_split_idx_luma; /* index for best frequency split (luma) */
-  int best_freq_split_idx_chroma; /* index for best frequency split (chroma) */
-  int best_Al_luma; /* best value for Al found in scan search (luma) */
-  int best_Al_chroma; /* best value for Al found in scan search (luma) */
-  boolean interleave_chroma_dc; /* indicate whether to interleave chroma DC scans */
-  struct jpeg_destination_mgr * saved_dest; /* saved value of cinfo->dest */
-} my_comp_master;
-
-typedef my_comp_master * my_master_ptr;
+#include "jcmaster.h"
 
 
 /*
@@ -919,21 +884,13 @@ finish_pass_master (j_compress_ptr cinfo)
 GLOBAL(void)
 jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
 {
-  my_master_ptr master;
+  my_master_ptr master = (my_master_ptr) cinfo->master;
 
-  master = (my_master_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(my_comp_master));
-  if (cinfo->master) {
-    MEMCOPY(&master->pub, cinfo->master, sizeof(struct jpeg_comp_master));
-    jpeg_free_small((j_common_ptr) cinfo, cinfo->master,
-                    sizeof(struct jpeg_comp_master));
-  }
-  cinfo->master = (struct jpeg_comp_master *) master;
   master->pub.prepare_for_pass = prepare_for_pass;
   master->pub.pass_startup = pass_startup;
   master->pub.finish_pass = finish_pass_master;
   master->pub.is_last_pass = FALSE;
+  master->pub.call_pass_startup = FALSE;
 
   /* Validate parameters, determine derived values */
   initial_setup(cinfo, transcode_only);
diff --git a/jcmaster.h b/jcmaster.h
new file mode 100644
index 00000000..8b1559d1
--- /dev/null
+++ b/jcmaster.h
@@ -0,0 +1,47 @@
+/*
+ * jcmaster.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains the master control structures for the JPEG compressor.
+ */
+
+
+/* Private state */
+
+typedef enum {
+        main_pass,              /* input data, also do first output step */
+        huff_opt_pass,          /* Huffman code optimization pass */
+        output_pass,            /* data output pass */
+        trellis_pass            /* trellis quantization pass */
+} c_pass_type;
+
+typedef struct {
+  struct jpeg_comp_master pub;  /* public fields */
+
+  c_pass_type pass_type;        /* the type of the current pass */
+
+  int pass_number;              /* # of passes completed */
+  int total_passes;             /* total # of passes needed */
+
+  int scan_number;              /* current index in scan_info[] */
+
+  /* fields for scan optimisation */
+  int pass_number_scan_opt_base; /* pass number where scan optimization begins */
+  unsigned char * scan_buffer[64]; /* buffer for a given scan */
+  unsigned long scan_size[64]; /* size for a given scan */
+  int actual_Al[64]; /* actual value of Al used for a scan */
+  unsigned long best_cost; /* bit count for best frequency split */
+  int best_freq_split_idx_luma; /* index for best frequency split (luma) */
+  int best_freq_split_idx_chroma; /* index for best frequency split (chroma) */
+  int best_Al_luma; /* best value for Al found in scan search (luma) */
+  int best_Al_chroma; /* best value for Al found in scan search (luma) */
+  boolean interleave_chroma_dc; /* indicate whether to interleave chroma DC scans */
+  struct jpeg_destination_mgr * saved_dest; /* saved value of cinfo->dest */
+} my_comp_master;
+
+typedef my_comp_master * my_master_ptr;

From 7432a53e2b849a1ccf0632aca516474561a1b546 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Sun, 14 Dec 2014 04:43:02 -0600
Subject: [PATCH 042/108] Introduce a TJ_REVERT environment variable, which
 reverts the behavior of the TurboJPEG API to libjpeg[-turbo] defaults. This
 serves the same purpose as the -revert switch in cjpeg.  It is primarily
 included for testing purposes.

---
 turbojpeg.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/turbojpeg.c b/turbojpeg.c
index 4d625c8c..e38fb984 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -206,6 +206,8 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo,
 	}
 
 	cinfo->input_components=tjPixelSize[pixelFormat];
+	if((env=getenv("TJ_REVERT"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
+		cinfo->master->compress_profile=JCP_FASTEST;
 	jpeg_set_defaults(cinfo);
 
 	if((env=getenv("TJ_OPTIMIZE"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))

From 3bc0839d7f1ea775385128028808ef757a941eb9 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Sun, 14 Dec 2014 05:31:57 -0600
Subject: [PATCH 043/108] Restore full bitwise compatibility with libjpeg-turbo
 when -revert is used. mozjpeg should produce identical output to libjpeg-turbo
 when the JCP_FASTEST compression profile is used.  That means that that
 profile needs to revert to the default libjpeg quantization/Huffman tables as
 well as disable mozjpeg's duplicate table checking feature.  This patch also
 adds -revert to any instance of cjpeg and jpegtran called by 'make test' (or
 ctest on Windows), so that those tests actually work again.  The tests aren't
 useful for regression testing the mozjpeg extensions, but at least they can
 now be used to regression test the underlying code.

---
 CMakeLists.txt | 22 +++++++++++-----------
 Makefile.am    | 22 +++++++++++-----------
 jcmarker.c     |  6 ++++++
 jcparam.c      |  3 ++-
 4 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 143fd370..fe4ef969 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -438,7 +438,7 @@ foreach(libtype shared static)
 
   # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
   add_test(cjpeg${suffix}-rgb-islow
-    ${dir}cjpeg${suffix} -rgb -dct int -outfile testout_rgb_islow.jpg
+    ${dir}cjpeg${suffix} -revert -rgb -dct int -outfile testout_rgb_islow.jpg
       ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
   add_test(cjpeg${suffix}-rgb-islow-cmp
     ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_RGB_ISLOW} -DFILE=testout_rgb_islow.jpg
@@ -471,7 +471,7 @@ foreach(libtype shared static)
 
   # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
   add_test(cjpeg${suffix}-422-ifast-opt
-    ${dir}cjpeg${suffix} -sample 2x1 -dct fast -opt
+    ${dir}cjpeg${suffix} -revert -sample 2x1 -dct fast -opt
       -outfile testout_422_ifast_opt.jpg
       ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
   add_test(cjpeg${suffix}-422-ifast-opt-cmp
@@ -513,7 +513,7 @@ foreach(libtype shared static)
 
   # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
   add_test(cjpeg${suffix}-420-q100-ifast-prog
-    ${dir}cjpeg${suffix} -sample 2x2 -quality 100 -dct fast -prog
+    ${dir}cjpeg${suffix} -revert -sample 2x2 -quality 100 -dct fast -prog
       -outfile testout_420_q100_ifast_prog.jpg
       ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
   add_test(cjpeg${suffix}-420-q100-ifast-prog-cmp
@@ -539,7 +539,7 @@ foreach(libtype shared static)
 
   # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
   add_test(cjpeg${suffix}-gray-islow
-    ${dir}cjpeg${suffix} -gray -dct int -outfile testout_gray_islow.jpg
+    ${dir}cjpeg${suffix} -revert -gray -dct int -outfile testout_gray_islow.jpg
       ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
   add_test(cjpeg${suffix}-gray-islow-cmp
     ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_GRAY_ISLOW}
@@ -583,7 +583,7 @@ foreach(libtype shared static)
   # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
   # ENT: 2-pass huff
   add_test(cjpeg${suffix}-420s-ifast-opt
-    ${dir}cjpeg${suffix} -sample 2x2 -smooth 1 -dct int -opt -outfile
+    ${dir}cjpeg${suffix} -revert -sample 2x2 -smooth 1 -dct int -opt -outfile
       testout_420s_ifast_opt.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
   add_test(cjpeg${suffix}-420s-ifast-opt-cmp
     ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420S_IFAST_OPT}
@@ -592,7 +592,7 @@ foreach(libtype shared static)
 
   # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
   add_test(cjpeg${suffix}-3x2-float-prog
-    ${dir}cjpeg${suffix} -sample 3x2 -dct float -prog
+    ${dir}cjpeg${suffix} -revert -sample 3x2 -dct float -prog
       -outfile testout_3x2_float_prog.jpg
       ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
   add_test(cjpeg${suffix}-3x2-float-prog-cmp
@@ -610,7 +610,7 @@ foreach(libtype shared static)
   if(WITH_ARITH_ENC)
     # CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
     add_test(cjpeg${suffix}-420-islow-ari
-      ${dir}cjpeg${suffix} -dct int -arithmetic
+      ${dir}cjpeg${suffix} -revert -dct int -arithmetic
         -outfile testout_420_islow_ari.jpg
         ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
     add_test(cjpeg${suffix}-420-islow-ari-cmp
@@ -618,7 +618,7 @@ foreach(libtype shared static)
         -DFILE=testout_420_islow_ari.jpg
         -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
     add_test(jpegtran${suffix}-420-islow-ari
-      ${dir}jpegtran${suffix} -arithmetic
+      ${dir}jpegtran${suffix} -revert -arithmetic
         -outfile testout_420_islow_ari.jpg
         ${CMAKE_SOURCE_DIR}/testimages/testimgint.jpg)
     add_test(jpegtran${suffix}-420-islow-ari-cmp
@@ -627,7 +627,7 @@ foreach(libtype shared static)
         -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
     # CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
     add_test(cjpeg${suffix}-444-islow-progari
-      ${dir}cjpeg${suffix} -sample 1x1 -dct int -progressive -arithmetic
+      ${dir}cjpeg${suffix} -revert -sample 1x1 -dct int -progressive -arithmetic
         -outfile testout_444_islow_progari.jpg
         ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
     add_test(cjpeg${suffix}-444-islow-progari-cmp
@@ -645,7 +645,7 @@ foreach(libtype shared static)
         -DFILE=testout_420m_ifast_ari.ppm
         -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
     add_test(jpegtran${suffix}-420-islow
-      ${dir}jpegtran${suffix} -outfile testout_420_islow.jpg
+      ${dir}jpegtran${suffix} -revert -outfile testout_420_islow.jpg
         ${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
     add_test(jpegtran${suffix}-420-islow-cmp
       ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420_ISLOW}
@@ -732,7 +732,7 @@ foreach(libtype shared static)
         -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
   endif()
   add_test(jpegtran${suffix}-crop
-    ${dir}jpegtran${suffix} -crop 120x90+20+50 -transpose -perfect
+    ${dir}jpegtran${suffix} -revert -crop 120x90+20+50 -transpose -perfect
       -outfile testout_crop.jpg ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
   add_test(jpegtran${suffix}-crop-cmp
     ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_CROP} -DFILE=testout_crop.jpg
diff --git a/Makefile.am b/Makefile.am
index 8a9a4d4b..9d6a231a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -298,7 +298,7 @@ endif
 # ones.)
 
 # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
-	./cjpeg -rgb -dct int -outfile testout_rgb_islow.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -rgb -dct int -outfile testout_rgb_islow.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_RGB_ISLOW) testout_rgb_islow.jpg
 # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -ppm -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg
@@ -318,7 +318,7 @@ else
 endif
 
 # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
-	./cjpeg -sample 2x1 -dct fast -opt -outfile testout_422_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -sample 2x1 -dct fast -opt -outfile testout_422_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_422_IFAST_OPT) testout_422_ifast_opt.jpg
 # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
 	./djpeg -dct fast -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg
@@ -342,7 +342,7 @@ else
 endif
 
 # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
-	./cjpeg -sample 2x2 -quality 100 -dct fast -prog -outfile testout_420_q100_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -sample 2x2 -quality 100 -dct fast -prog -outfile testout_420_q100_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_420_IFAST_Q100_PROG) testout_420_q100_ifast_prog.jpg
 # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
 	./djpeg -dct fast -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
@@ -354,7 +354,7 @@ endif
 	rm testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
 
 # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
-	./cjpeg -gray -dct int -outfile testout_gray_islow.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -gray -dct int -outfile testout_gray_islow.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_GRAY_ISLOW) testout_gray_islow.jpg
 # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -outfile testout_gray_islow.ppm testout_gray_islow.jpg
@@ -379,12 +379,12 @@ endif
 
 # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
 # ENT: 2-pass huff
-	./cjpeg -sample 2x2 -smooth 1 -dct int -opt -outfile testout_420s_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -sample 2x2 -smooth 1 -dct int -opt -outfile testout_420s_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_420S_IFAST_OPT) testout_420s_ifast_opt.jpg
 	rm testout_420s_ifast_opt.jpg
 
 # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
-	./cjpeg -sample 3x2 -dct float -prog -outfile testout_3x2_float_prog.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -sample 3x2 -dct float -prog -outfile testout_3x2_float_prog.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_3x2_FLOAT_PROG) testout_3x2_float_prog.jpg
 # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
 	./djpeg -dct float -outfile testout_3x2_float.ppm testout_3x2_float_prog.jpg
@@ -393,14 +393,14 @@ endif
 
 if WITH_ARITH_ENC
 # CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
-	./cjpeg -dct int -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -dct int -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
 	rm testout_420_islow_ari.jpg
-	./jpegtran -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testimgint.jpg
+	./jpegtran -revert -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testimgint.jpg
 	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
 	rm testout_420_islow_ari.jpg
 # CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
-	./cjpeg -sample 1x1 -dct int -progressive -arithmetic -outfile testout_444_islow_progari.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -revert -sample 1x1 -dct int -progressive -arithmetic -outfile testout_444_islow_progari.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_444_ISLOW_PROGARI) testout_444_islow_progari.jpg
 	rm testout_444_islow_progari.jpg
 endif
@@ -409,7 +409,7 @@ if WITH_ARITH_DEC
 	./djpeg -fast -ppm -outfile testout_420m_ifast_ari.ppm $(srcdir)/testimages/testimgari.jpg
 	md5/md5cmp $(MD5_PPM_420M_IFAST_ARI) testout_420m_ifast_ari.ppm
 	rm testout_420m_ifast_ari.ppm
-	./jpegtran -outfile testout_420_islow.jpg $(srcdir)/testimages/testimgari.jpg
+	./jpegtran -revert -outfile testout_420_islow.jpg $(srcdir)/testimages/testimgari.jpg
 	md5/md5cmp $(MD5_JPEG_420_ISLOW) testout_420_islow.jpg
 	rm testout_420_islow.jpg
 endif
@@ -486,7 +486,7 @@ else
 	rm testout_420m_islow_565D.bmp
 endif
 
-	./jpegtran -crop 120x90+20+50 -transpose -perfect -outfile testout_crop.jpg $(srcdir)/testimages/$(TESTORIG)
+	./jpegtran -revert -crop 120x90+20+50 -transpose -perfect -outfile testout_crop.jpg $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_JPEG_CROP) testout_crop.jpg
 	rm testout_crop.jpg
 
diff --git a/jcmarker.c b/jcmarker.c
index 9268b0c9..655f7de0 100644
--- a/jcmarker.c
+++ b/jcmarker.c
@@ -194,6 +194,9 @@ emit_multi_dqt (j_compress_ptr cinfo)
   int ci;
   int size = 0;
 
+  if (cinfo->master->compress_profile == JCP_FASTEST)
+    return -1;
+
   for (ci = 0; ci < cinfo->num_components; ci++) {
     int tbl_num = cinfo->comp_info[ci].quant_tbl_no;
     int i;
@@ -296,6 +299,9 @@ emit_multi_dht (j_compress_ptr cinfo)
   JHUFF_TBL *dcseen[NUM_HUFF_TBLS] = { NULL };
   JHUFF_TBL *acseen[NUM_HUFF_TBLS] = { NULL };
 
+  if (cinfo->master->compress_profile == JCP_FASTEST)
+    return 0;
+
   /* Calclate the total length. */
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     jpeg_component_info *compptr = cinfo->cur_comp_info[i];
diff --git a/jcparam.c b/jcparam.c
index 9b68761b..e47c0576 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -501,7 +501,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
     cinfo->master->compress_profile == JCP_MAX_COMPRESSION;
   cinfo->master->lambda_log_scale1 = 14.75;
   cinfo->master->lambda_log_scale2 = 16.5;
-  cinfo->master->quant_tbl_master_idx = 3;
+  cinfo->master->quant_tbl_master_idx =
+    cinfo->master->compress_profile == JCP_MAX_COMPRESSION ? 3 : 0;
   
   cinfo->master->use_lambda_weight_tbl = TRUE;
   cinfo->master->use_scans_in_trellis = FALSE;

From 6b99f99b881d118f874db55aa63577cdbe9cb0fb Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sun, 14 Dec 2014 11:47:08 +0000
Subject: [PATCH 044/108] Fix 'make dist'

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1440 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile.am b/Makefile.am
index 1aeb5ddb..569c0bbc 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -168,7 +168,8 @@ dist_example_DATA = example.c
 
 EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \
 	sharedlib/CMakeLists.txt cmakescripts libjpeg.map.in doc doxygen.config \
-	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jstdhuff.c
+	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jdmrg565.c \
+	jstdhuff.c
 
 dist-hook:
 	rm -rf `find $(distdir) -name .svn`

From fa01dcb5191ab563e7cddbae1fafda623e9b0277 Mon Sep 17 00:00:00 2001
From: DRC <information@virtualgl.org>
Date: Sun, 14 Dec 2014 06:00:40 -0600
Subject: [PATCH 045/108] Partially fix 'make dist'. This incorporates an
 upstream fix to add jdmrg565.c to the tarball created by 'make dist', as well
 as a fix to add the new jcmaster.h file to same.  There are still some
 mozjpeg-specific files that aren't added when doing 'make dist'. I'll let
 someone else worry about those.  This patch mainly ensures that any files
 that might be eventually adopted upstream are included.

---
 Makefile.am | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 9d6a231a..1f84bb9b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -11,8 +11,8 @@ endif
 nodist_include_HEADERS = jconfig.h
 
 
-HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
+HDRS = jchuff.h jcmaster.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h \
+	jmorecfg.h jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
 	jpeg_nbits_table.h
 
 libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
@@ -182,7 +182,8 @@ dist_example_DATA = example.c
 
 EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \
 	sharedlib/CMakeLists.txt cmakescripts libjpeg.map.in doc doxygen.config \
-	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jstdhuff.c
+	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jdmrg565.c \
+	jstdhuff.c
 
 dist-hook:
 	rm -rf `find $(distdir) -name .svn`

From a06aeb25f2c5bc986d46301113df2eaf2a3c055c Mon Sep 17 00:00:00 2001
From: Antony Dovgal <tony@daylessday.org>
Date: Tue, 16 Dec 2014 13:52:33 +0300
Subject: [PATCH 046/108] port upstream fix for local buffer overflow

r1425

Fix Huffman local buffer overrun discovered by Debian developers when
attempting to transform a junk image using ImageMagick:
https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=768369
---
 ChangeLog.txt | 12 ++++++++++++
 jchuff.c      | 11 ++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 9fa884a4..ec9da645 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -16,6 +16,18 @@ instead of -1 if width was < 1.
 [8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
 print the library version and exit.
 
+[9] Referring to 1.4 beta1 [15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun.  Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
 
 1.3.90 (1.4 beta1)
 ==================
diff --git a/jchuff.c b/jchuff.c
index d8eb1d35..69a21cfe 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -408,7 +408,16 @@ dump_buffer (working_state * state)
 #endif
 
 
-#define BUFSIZE (DCTSIZE2 * 2) + 8
+/* Although it is exceedingly rare, it is possible for a Huffman-encoded
+ * coefficient block to be larger than the 128-byte unencoded block.  For each
+ * of the 64 coefficients, PUT_BITS is invoked twice, and each invocation can
+ * theoretically store 16 bits (for a maximum of 2048 bits or 256 bytes per
+ * encoded block.)  If, for instance, one artificially sets the AC
+ * coefficients to alternating values of 32767 and -32768 (using the JPEG
+ * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
+ * larger than 200 bytes.
+ */
+#define BUFSIZE (DCTSIZE2 * 4)
 
 #define LOAD_BUFFER() { \
   if (state->free_in_buffer < BUFSIZE) { \

From a6b1bda0940c50ea74b64541c55778b9e66fdb88 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Wed, 17 Dec 2014 07:51:12 +0900
Subject: [PATCH 047/108] Consider vertical gradient in DC trellis

Add extension parameter JFLOAT_TRELLIS_DELTA_DC_WEIGHT that controls
how distortion is calculated in DC trellis quantization. The parameter
defines weighting between actual distortion of DC and distortion of
vertical gradient of DC.
By default the parameter is 0.0 and has no effect.
Addresses #117
---
 README-mozilla.txt |  7 +++++++
 cjpeg.c            |  7 +++++++
 jccoefct.c         |  3 ++-
 jcdctmgr.c         | 21 ++++++++++++++++++++-
 jcext.c            |  6 ++++++
 jchuff.h           |  3 ++-
 jcparam.c          |  1 +
 jpegint.h          |  2 ++
 jpeglib.h          |  3 ++-
 9 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/README-mozilla.txt b/README-mozilla.txt
index ebb86e6e..7187f939 100644
--- a/README-mozilla.txt
+++ b/README-mozilla.txt
@@ -131,6 +131,13 @@ Floating Point Extension Parameters Supported by mozjpeg
   rely on n:
     lambda = 2^(s1-12) / q^2.
 
+* JFLOAT_TRELLIS_DELTA_DC_WEIGHT (default: 0.0)
+  This parameter controls how distortion is calculated in DC trellis quantization
+  (enabled with JBOOLEAN_TRELLIS_QUANT_DC). It defines weighting between distortion
+  of the DC coefficient and distortion of the vertical gradient of DC coefficients.
+  The value of the parameter corresponds to the weight applied to the distortion
+  of the vertical gradient.
+
 
 Integer Extension Parameters Supported by mozjpeg
 -------------------------------------------------
diff --git a/cjpeg.c b/cjpeg.c
index 41558a79..c6ffb09f 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -530,6 +530,13 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       /* disable trellis quantization */
       jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT, FALSE);
       
+    } else if (keymatch(arg, "trellis-dc-ver-weight", 12)) {
+      if (++argn >= argc) {      /* advance to next argument */
+        fprintf(stderr, "%s: missing argument for trellis-dc-ver-weight\n", progname);
+        usage();
+      }
+      jpeg_c_set_float_param(cinfo, JFLOAT_TRELLIS_DELTA_DC_WEIGHT, atof(argv[argn]));
+      
     } else if (keymatch(arg, "trellis-dc", 9)) {
       /* enable DC trellis quantization */
       jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT_DC, TRUE);
diff --git a/jccoefct.c b/jccoefct.c
index 38769976..3a792c8b 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -405,12 +405,13 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      */
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
+      lastblockrow = (block_row > 0) ? buffer[block_row-1] : NULL;
       quantize_trellis(cinfo, dctbl, actbl, thisblockrow,
                        buffer_dst[block_row], blocks_across,
                        cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
                        cinfo->master->norm_src[compptr->quant_tbl_no],
                        cinfo->master->norm_coef[compptr->quant_tbl_no],
-                       &lastDC);
+                       &lastDC, lastblockrow, buffer_dst[block_row-1]);
       
       if (ndummy > 0) {
         /* Create dummy blocks at the right edge of the image. */
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 518592b8..a373f3ff 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -879,7 +879,8 @@ static const float jpeg_lambda_weights_csf_luma[64] = {
 
 GLOBAL(void)
 quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
-                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val)
+                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
+                 JBLOCKROW coef_blocks_above, JBLOCKROW src_above)
 {
   int i, j, k, l;
   float accumulated_zero_dist[DCTSIZE2];
@@ -1000,6 +1001,24 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
         dc_candidate_dist = delta * delta * lambda_dc;
         dc_candidate[k][bi] *= 1 + 2*sign;
         
+        /* Take into account DC differences */
+        if (coef_blocks_above && src_above && cinfo->master->trellis_delta_dc_weight > 0.0) {
+          int dc_above_orig;
+          int dc_above_recon;
+          int dc_orig;
+          int dc_recon;
+          float vertical_dist;
+          
+          dc_above_orig = src_above[bi][0];
+          dc_above_recon = coef_blocks_above[bi][0] * q;
+          dc_orig = src[bi][0];
+          dc_recon = dc_candidate[k][bi] * q;
+          /* delta is difference of vertical gradients */
+          delta = (dc_above_orig - dc_orig) - (dc_above_recon - dc_recon);
+          vertical_dist = delta * delta * lambda_dc;
+          dc_candidate_dist +=  cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
+        }
+        
         if (bi == 0) {
           dc_delta = dc_candidate[k][bi] - *last_dc_val;
 
diff --git a/jcext.c b/jcext.c
index 30ec3a4f..5a6a110b 100644
--- a/jcext.c
+++ b/jcext.c
@@ -103,6 +103,7 @@ jpeg_c_float_param_supported (j_compress_ptr cinfo, J_FLOAT_PARAM param)
   switch (param) {
   case JFLOAT_LAMBDA_LOG_SCALE1:
   case JFLOAT_LAMBDA_LOG_SCALE2:
+  case JFLOAT_TRELLIS_DELTA_DC_WEIGHT:
     return TRUE;
   }
 
@@ -120,6 +121,9 @@ jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param, float value)
   case JFLOAT_LAMBDA_LOG_SCALE2:
     cinfo->master->lambda_log_scale2 = value;
     break;
+  case JFLOAT_TRELLIS_DELTA_DC_WEIGHT:
+    cinfo->master->trellis_delta_dc_weight = value;
+    break;
   default:
     ERREXIT(cinfo, JERR_BAD_PARAM);
   }
@@ -134,6 +138,8 @@ jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param)
     return cinfo->master->lambda_log_scale1;
   case JFLOAT_LAMBDA_LOG_SCALE2:
     return cinfo->master->lambda_log_scale2;
+  case JFLOAT_TRELLIS_DELTA_DC_WEIGHT:
+    return cinfo->master->trellis_delta_dc_weight;
   default:
     ERREXIT(cinfo, JERR_BAD_PARAM);
   }
diff --git a/jchuff.h b/jchuff.h
index b58538bc..19d1c647 100644
--- a/jchuff.h
+++ b/jchuff.h
@@ -45,4 +45,5 @@ EXTERN(void) jpeg_gen_optimal_table
 
 EXTERN(void) quantize_trellis
         (j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
-                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val);
+                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
+         JBLOCKROW coef_blocks_above, JBLOCKROW src_above);
diff --git a/jcparam.c b/jcparam.c
index e47c0576..224d687c 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -510,6 +510,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   cinfo->master->trellis_num_loops = 1;
   cinfo->master->trellis_q_opt = FALSE;
   cinfo->master->trellis_quant_dc = TRUE;
+  cinfo->master->trellis_delta_dc_weight = 0.0;
 }
 
 
diff --git a/jpegint.h b/jpegint.h
index 36c5e409..1b8f77bf 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -87,6 +87,8 @@ struct jpeg_comp_master {
 
   float lambda_log_scale1;
   float lambda_log_scale2;
+  
+  float trellis_delta_dc_weight;
 };
 
 /* Main buffer control (downsampled-data buffer) */
diff --git a/jpeglib.h b/jpeglib.h
index be895a96..059b0cf4 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -288,7 +288,8 @@ typedef enum {
 
 typedef enum {
   JFLOAT_LAMBDA_LOG_SCALE1 = 0x5B61A599,
-  JFLOAT_LAMBDA_LOG_SCALE2 = 0xB9BBAE03
+  JFLOAT_LAMBDA_LOG_SCALE2 = 0xB9BBAE03,
+  JFLOAT_TRELLIS_DELTA_DC_WEIGHT = 0x13775453
 } J_FLOAT_PARAM;
 
 /* Integer parameters */

From 8b7ffebe421bf0671ce792ebc937b3124c98d1b3 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Wed, 17 Dec 2014 07:53:18 +0900
Subject: [PATCH 048/108] Fix filename in C file header

---
 jcext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jcext.c b/jcext.c
index 5a6a110b..9c95f198 100644
--- a/jcext.c
+++ b/jcext.c
@@ -1,5 +1,5 @@
 /*
- * jccompat.c
+ * jcext.c
  *
  * Copyright (C) 2014, D. R. Commander.
  * Copyright (C) 2014, Mozilla Corporation.

From 296c8bad7e2698a72623b1122ab458a131ed0bbc Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 17 Dec 2014 08:00:29 +0000
Subject: [PATCH 049/108] Fix cosmetic issues in AltiVec comments

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1442 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_powerpc_altivec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index 84132d03..e18eaa8e 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -47,7 +47,7 @@
   row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
   \
                                        /* transpose coefficients (phase 2) */ \
-  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61} */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
   col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
   col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
   col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
@@ -58,7 +58,7 @@
   \
                                        /* transpose coefficients (phase 3) */ \
   col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
-  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71} */   \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
   col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
   col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
   col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \

From 25e40dc42caa23c912147c1b9867d99b2767f631 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 17 Dec 2014 08:04:39 +0000
Subject: [PATCH 050/108] AltiVec SIMD implementation of slow integer forward
 DCT;  Clean up fast integer forward DCT code so that it is easier to see how
 it derives from the SSE2 code and to make it play more nicely with the slow
 FDCT code.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1443 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd.h                 |   2 +
 simd/jsimd_powerpc.c         |  12 ++
 simd/jsimd_powerpc_altivec.c | 299 +++++++++++++++++++++++++++++++++--
 3 files changed, 300 insertions(+), 13 deletions(-)

diff --git a/simd/jsimd.h b/simd/jsimd.h
index b0329728..4dcdfc1a 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -545,6 +545,8 @@ EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
 
 EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
 
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
+
 /* Fast Integer Forward DCT */
 EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
 
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index a9a5965d..ff37c5f0 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -226,6 +226,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
 GLOBAL(int)
 jsimd_can_fdct_islow (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -255,6 +266,7 @@ jsimd_can_fdct_float (void)
 GLOBAL(void)
 jsimd_fdct_islow (DCTELEM * data)
 {
+  jsimd_fdct_islow_altivec(data);
 }
 
 GLOBAL(void)
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index e18eaa8e..ef32545d 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -29,6 +29,9 @@
 #include "jsimd.h"
 #include <altivec.h>
 
+
+/* Common code */
+
 #define TRANSPOSE(row, col)  \
 {  \
   __vector short row04l, row04h, row15l, row15h,  \
@@ -67,15 +70,30 @@
   col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
 }
 
-static const __vector short constants __attribute__((aligned(16))) =
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#define IFAST_CONST_BITS 8
+#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
+#define IFAST_CONST_SHIFT \
+  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
+
+static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
 {
-  98 << 5,   /* FIX(0.382683433) */
-  139 << 5,  /* FIX(0.541196100) */
-  181 << 5,  /* FIX(0.707106781) */
-  334 << 5   /* FIX(1.306562965) */
+  98 << IFAST_CONST_SHIFT,   /* FIX(0.382683433) */
+  139 << IFAST_CONST_SHIFT,  /* FIX(0.541196100) */
+  181 << IFAST_CONST_SHIFT,  /* FIX(0.707106781) */
+  334 << IFAST_CONST_SHIFT   /* FIX(1.306562965) */
 };
 
-#define DO_DCT()  \
+#define DO_FDCT_IFAST()  \
 {  \
   /* Even part */  \
   \
@@ -134,11 +152,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
   /* Constants */
   __vector short zero = vec_splat_s16(0),
-    PW_0382 = vec_splat(constants, 0),
-    PW_0541 = vec_splat(constants, 1),
-    PW_0707 = vec_splat(constants, 2),
-    PW_1306 = vec_splat(constants, 3);
-  __vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
+    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
+    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
+    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
+    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
+  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
+    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
 
   /* Pass 1: process rows. */
 
@@ -162,7 +181,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp3 = vec_add(col3, col4);
   tmp4 = vec_sub(col3, col4);
 
-  DO_DCT();
+  DO_FDCT_IFAST();
 
   /* Pass 2: process columns. */
 
@@ -177,7 +196,261 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp3 = vec_add(row3, row4);
   tmp4 = vec_sub(row3, row4);
 
-  DO_DCT();
+  DO_FDCT_IFAST();
+
+  *(__vector short *)&data[0] = out0;
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
+
+
+/* SLOW INTEGER FORWARD DCT */
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define ISLOW_CONST_BITS 13
+#define ISLOW_PASS1_BITS 2
+#define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
+#define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
+
+static const __vector int jconst_fdct_islow __attribute__((aligned(16))) =
+{
+  1 << (ISLOW_DESCALE_P1 - 1),
+  1 << (ISLOW_DESCALE_P2 - 1)
+};
+
+static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
+{
+  1 << (ISLOW_PASS1_BITS - 1)
+};
+
+#define DO_FDCT_ISLOW_COMMON(PASS)  \
+{  \
+  tmp1312l = vec_mergeh(tmp13, tmp12);  \
+  tmp1312h = vec_mergel(tmp13, tmp12);  \
+  \
+  out2l = vec_msums(tmp1312l, PW_F130_F054, zero);  \
+  out2h = vec_msums(tmp1312h, PW_F130_F054, zero);  \
+  out6l = vec_msums(tmp1312l, PW_F054_MF130, zero);  \
+  out6h = vec_msums(tmp1312h, PW_F054_MF130, zero);  \
+  \
+  out2l = vec_add(out2l, PD_DESCALE_P##PASS);  \
+  out2h = vec_add(out2h, PD_DESCALE_P##PASS);  \
+  out2l = vec_sr(out2l, DESCALE_P##PASS);  \
+  out2h = vec_sr(out2h, DESCALE_P##PASS);  \
+  \
+  out6l = vec_add(out6l, PD_DESCALE_P##PASS);  \
+  out6h = vec_add(out6h, PD_DESCALE_P##PASS);  \
+  out6l = vec_sr(out6l, DESCALE_P##PASS);  \
+  out6h = vec_sr(out6h, DESCALE_P##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(tmp4, tmp6);  \
+  z4 = vec_add(tmp5, tmp7);  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, PW_MF078_F117, zero);  \
+  z3h = vec_msums(z34h, PW_MF078_F117, zero);  \
+  z4l = vec_msums(z34l, PW_F117_F078, zero);  \
+  z4h = vec_msums(z34h, PW_F117_F078, zero);  \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7);  \
+  tmp47h = vec_mergel(tmp4, tmp7);  \
+  \
+  tmp4l = vec_msums(tmp47l, PW_MF060_MF089, zero);  \
+  tmp4h = vec_msums(tmp47h, PW_MF060_MF089, zero);  \
+  tmp7l = vec_msums(tmp47l, PW_MF089_F060, zero);  \
+  tmp7h = vec_msums(tmp47h, PW_MF089_F060, zero);  \
+  \
+  out7l = vec_add(z3l, tmp4l);  \
+  out7h = vec_add(z3h, tmp4h);  \
+  out1l = vec_add(z4l, tmp7l);  \
+  out1h = vec_add(z4h, tmp7h);  \
+  \
+  out7l = vec_add(out7l, PD_DESCALE_P##PASS);  \
+  out7h = vec_add(out7h, PD_DESCALE_P##PASS);  \
+  out7l = vec_sr(out7l, DESCALE_P##PASS);  \
+  out7h = vec_sr(out7h, DESCALE_P##PASS);  \
+  \
+  out1l = vec_add(out1l, PD_DESCALE_P##PASS);  \
+  out1h = vec_add(out1h, PD_DESCALE_P##PASS);  \
+  out1l = vec_sr(out1l, DESCALE_P##PASS);  \
+  out1h = vec_sr(out1h, DESCALE_P##PASS);  \
+  \
+  out7 = vec_pack(out7l, out7h);  \
+  out1 = vec_pack(out1l, out1h);  \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6);  \
+  tmp56h = vec_mergel(tmp5, tmp6);  \
+  \
+  tmp5l = vec_msums(tmp56l, PW_MF050_MF256, zero);  \
+  tmp5h = vec_msums(tmp56h, PW_MF050_MF256, zero);  \
+  tmp6l = vec_msums(tmp56l, PW_MF256_F050, zero);  \
+  tmp6h = vec_msums(tmp56h, PW_MF256_F050, zero);  \
+  \
+  out5l = vec_add(tmp5l, z4l);  \
+  out5h = vec_add(tmp5h, z4h);  \
+  out3l = vec_add(tmp6l, z3l);  \
+  out3h = vec_add(tmp6h, z3h);  \
+  \
+  out5l = vec_add(out5l, PD_DESCALE_P##PASS);  \
+  out5h = vec_add(out5h, PD_DESCALE_P##PASS);  \
+  out5l = vec_sr(out5l, DESCALE_P##PASS);  \
+  out5h = vec_sr(out5h, DESCALE_P##PASS);  \
+  \
+  out3l = vec_add(out3l, PD_DESCALE_P##PASS);  \
+  out3h = vec_add(out3h, PD_DESCALE_P##PASS);  \
+  out3l = vec_sr(out3l, DESCALE_P##PASS);  \
+  out3h = vec_sr(out3h, DESCALE_P##PASS);  \
+  \
+  out5 = vec_pack(out5l, out5h);  \
+  out3 = vec_pack(out3l, out3h);  \
+}
+
+#define DO_FDCT_ISLOW_ROWS()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_sl(out0, PASS1_BITS);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_sl(out4, PASS1_BITS);  \
+  \
+  DO_FDCT_ISLOW_COMMON(1);  \
+}
+
+#define DO_FDCT_ISLOW_COLS()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_add(out0, PW_DESCALE_P2X);  \
+  out0  = vec_sra(out0, PASS1_BITS);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_add(out4, PW_DESCALE_P2X);  \
+  out4  = vec_sra(out4, PASS1_BITS);  \
+  \
+  DO_FDCT_ISLOW_COMMON(2);  \
+}
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp4l, tmp4h, tmp5l, tmp5h, tmp6l, tmp6h, tmp7l, tmp7h,
+    z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  __vector short PW_F130_F054 = {F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541};
+  __vector short PW_F054_MF130 = {F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847};
+  __vector short PW_MF078_F117 = {F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175};
+  __vector short PW_F117_F078 = {F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390};
+  __vector short PW_MF060_MF089 = {F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899};
+  __vector short PW_MF089_F060 = {-F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899};
+  __vector short PW_MF050_MF256 = {F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562};
+  __vector short PW_MF256_F050 = {-F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562};
+  __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
+
+  /* Constants */
+  __vector unsigned short PASS1_BITS = vec_splat_u16(ISLOW_PASS1_BITS);
+  __vector int zero = vec_splat_s32(0),
+    PD_DESCALE_P1 = vec_splat(jconst_fdct_islow, 0),
+    PD_DESCALE_P2 = vec_splat(jconst_fdct_islow, 1);
+  __vector unsigned int DESCALE_P1 = vec_splat_u32(ISLOW_DESCALE_P1),
+    DESCALE_P2 = vec_splat_u32(ISLOW_DESCALE_P2);
+
+  /* Pass 1: process rows. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_ISLOW_ROWS();
+
+  /* Pass 2: process columns. */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_ISLOW_COLS();
 
   *(__vector short *)&data[0] = out0;
   *(__vector short *)&data[8] = out1;

From 7475e5963726fb8e60137dd612f5abee2dbda8af Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Thu, 18 Dec 2014 09:49:39 +0000
Subject: [PATCH 051/108] Further cleanup of the AltiVec forward DCT code: --
 Use macros to represent the fast FDCT constants, to facilitate comparing the
 AltiVec implementation of the algorithm with the SSE2 implementation. --
 Rename slow FDCT constants for consistency. -- Use vec_sra() in all cases in
 the slow FDCT code.  The SSE2 implementation uses psraw, which is an
 arithmetic shift, so we need to do likewise with AltiVec.  Using vec_sr()
 hasn't caused any problems yet, but it is conceivable that it might cause
 different behavior in certain corner cases.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1444 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_powerpc_altivec.c | 121 ++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 52 deletions(-)

diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index ef32545d..6f26bd5f 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -80,6 +80,11 @@
  *     (the elements in arg1 * the elements in arg2).
  */
 
+#define IFAST_F_0_382 98   /* FIX(0.382683433) */
+#define IFAST_F_0_541 139  /* FIX(0.541196100) */
+#define IFAST_F_0_707 181  /* FIX(0.707106781) */
+#define IFAST_F_1_306 334  /* FIX(1.306562965) */
+
 #define IFAST_CONST_BITS 8
 #define IFAST_PRE_MULTIPLY_SCALE_BITS 2
 #define IFAST_CONST_SHIFT \
@@ -87,10 +92,10 @@
 
 static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
 {
-  98 << IFAST_CONST_SHIFT,   /* FIX(0.382683433) */
-  139 << IFAST_CONST_SHIFT,  /* FIX(0.541196100) */
-  181 << IFAST_CONST_SHIFT,  /* FIX(0.707106781) */
-  334 << IFAST_CONST_SHIFT   /* FIX(1.306562965) */
+  IFAST_F_0_382 << IFAST_CONST_SHIFT,
+  IFAST_F_0_541 << IFAST_CONST_SHIFT,
+  IFAST_F_0_707 << IFAST_CONST_SHIFT,
+  IFAST_F_1_306 << IFAST_CONST_SHIFT
 };
 
 #define DO_FDCT_IFAST()  \
@@ -211,18 +216,18 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
 /* SLOW INTEGER FORWARD DCT */
 
-#define F_0_298 2446   /* FIX(0.298631336) */
-#define F_0_390 3196   /* FIX(0.390180644) */
-#define F_0_541 4433   /* FIX(0.541196100) */
-#define F_0_765 6270   /* FIX(0.765366865) */
-#define F_0_899 7373   /* FIX(0.899976223) */
-#define F_1_175 9633   /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
+#define ISLOW_F_0_298 2446   /* FIX(0.298631336) */
+#define ISLOW_F_0_390 3196   /* FIX(0.390180644) */
+#define ISLOW_F_0_541 4433   /* FIX(0.541196100) */
+#define ISLOW_F_0_765 6270   /* FIX(0.765366865) */
+#define ISLOW_F_0_899 7373   /* FIX(0.899976223) */
+#define ISLOW_F_1_175 9633   /* FIX(1.175875602) */
+#define ISLOW_F_1_501 12299  /* FIX(1.501321110) */
+#define ISLOW_F_1_847 15137  /* FIX(1.847759065) */
+#define ISLOW_F_1_961 16069  /* FIX(1.961570560) */
+#define ISLOW_F_2_053 16819  /* FIX(2.053119869) */
+#define ISLOW_F_2_562 20995  /* FIX(2.562915447) */
+#define ISLOW_F_3_072 25172  /* FIX(3.072711026) */
 
 #define ISLOW_CONST_BITS 13
 #define ISLOW_PASS1_BITS 2
@@ -252,13 +257,13 @@ static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
   \
   out2l = vec_add(out2l, PD_DESCALE_P##PASS);  \
   out2h = vec_add(out2h, PD_DESCALE_P##PASS);  \
-  out2l = vec_sr(out2l, DESCALE_P##PASS);  \
-  out2h = vec_sr(out2h, DESCALE_P##PASS);  \
+  out2l = vec_sra(out2l, DESCALE_P##PASS);  \
+  out2h = vec_sra(out2h, DESCALE_P##PASS);  \
   \
   out6l = vec_add(out6l, PD_DESCALE_P##PASS);  \
   out6h = vec_add(out6h, PD_DESCALE_P##PASS);  \
-  out6l = vec_sr(out6l, DESCALE_P##PASS);  \
-  out6h = vec_sr(out6h, DESCALE_P##PASS);  \
+  out6l = vec_sra(out6l, DESCALE_P##PASS);  \
+  out6h = vec_sra(out6h, DESCALE_P##PASS);  \
   \
   out2 = vec_pack(out2l, out2h);  \
   out6 = vec_pack(out6l, out6h);  \
@@ -291,13 +296,13 @@ static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
   \
   out7l = vec_add(out7l, PD_DESCALE_P##PASS);  \
   out7h = vec_add(out7h, PD_DESCALE_P##PASS);  \
-  out7l = vec_sr(out7l, DESCALE_P##PASS);  \
-  out7h = vec_sr(out7h, DESCALE_P##PASS);  \
+  out7l = vec_sra(out7l, DESCALE_P##PASS);  \
+  out7h = vec_sra(out7h, DESCALE_P##PASS);  \
   \
   out1l = vec_add(out1l, PD_DESCALE_P##PASS);  \
   out1h = vec_add(out1h, PD_DESCALE_P##PASS);  \
-  out1l = vec_sr(out1l, DESCALE_P##PASS);  \
-  out1h = vec_sr(out1h, DESCALE_P##PASS);  \
+  out1l = vec_sra(out1l, DESCALE_P##PASS);  \
+  out1h = vec_sra(out1h, DESCALE_P##PASS);  \
   \
   out7 = vec_pack(out7l, out7h);  \
   out1 = vec_pack(out1l, out1h);  \
@@ -317,13 +322,13 @@ static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
   \
   out5l = vec_add(out5l, PD_DESCALE_P##PASS);  \
   out5h = vec_add(out5h, PD_DESCALE_P##PASS);  \
-  out5l = vec_sr(out5l, DESCALE_P##PASS);  \
-  out5h = vec_sr(out5h, DESCALE_P##PASS);  \
+  out5l = vec_sra(out5l, DESCALE_P##PASS);  \
+  out5h = vec_sra(out5h, DESCALE_P##PASS);  \
   \
   out3l = vec_add(out3l, PD_DESCALE_P##PASS);  \
   out3h = vec_add(out3h, PD_DESCALE_P##PASS);  \
-  out3l = vec_sr(out3l, DESCALE_P##PASS);  \
-  out3h = vec_sr(out3h, DESCALE_P##PASS);  \
+  out3l = vec_sra(out3l, DESCALE_P##PASS);  \
+  out3h = vec_sra(out3h, DESCALE_P##PASS);  \
   \
   out5 = vec_pack(out5l, out5h);  \
   out3 = vec_pack(out3l, out3h);  \
@@ -379,30 +384,42 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
     out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
     out7l, out7h;
 
-  __vector short PW_F130_F054 = {F_0_541 + F_0_765, F_0_541,
-    F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765, F_0_541,
-    F_0_541 + F_0_765, F_0_541};
-  __vector short PW_F054_MF130 = {F_0_541, F_0_541 - F_1_847,
-    F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
-    F_0_541, F_0_541 - F_1_847};
-  __vector short PW_MF078_F117 = {F_1_175 - F_1_961, F_1_175,
-    F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961, F_1_175,
-    F_1_175 - F_1_961, F_1_175};
-  __vector short PW_F117_F078 = {F_1_175, F_1_175 - F_0_390,
-    F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
-    F_1_175, F_1_175 - F_0_390};
-  __vector short PW_MF060_MF089 = {F_0_298 - F_0_899, -F_0_899,
-    F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899, -F_0_899,
-    F_0_298 - F_0_899, -F_0_899};
-  __vector short PW_MF089_F060 = {-F_0_899, F_1_501 - F_0_899,
-    -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
-    -F_0_899, F_1_501 - F_0_899};
-  __vector short PW_MF050_MF256 = {F_2_053 - F_2_562, -F_2_562,
-    F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562, -F_2_562,
-    F_2_053 - F_2_562, -F_2_562};
-  __vector short PW_MF256_F050 = {-F_2_562, F_3_072 - F_2_562,
-    -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
-    -F_2_562, F_3_072 - F_2_562};
+  __vector short PW_F130_F054 = {ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541,
+    ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541,
+    ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541,
+    ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541};
+  __vector short PW_F054_MF130 = {ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847,
+    ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847,
+    ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847,
+    ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847};
+  __vector short PW_MF078_F117 = {ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175,
+    ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175,
+    ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175,
+    ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175};
+  __vector short PW_F117_F078 = {ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390,
+    ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390,
+    ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390,
+    ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390};
+  __vector short PW_MF060_MF089 = {
+    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899,
+    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899,
+    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899,
+    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899};
+  __vector short PW_MF089_F060 = {
+    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899,
+    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899,
+    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899,
+    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899};
+  __vector short PW_MF050_MF256 = {
+    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562,
+    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562,
+    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562,
+    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562};
+  __vector short PW_MF256_F050 = {
+    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562,
+    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562,
+    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562,
+    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562};
   __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
 
   /* Constants */

From c7dadd2d0b64fa2cd85c3c064e7e614b30c6c827 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Thu, 18 Dec 2014 10:12:29 +0000
Subject: [PATCH 052/108] AltiVec SIMD implementation of fast integer inverse
 DCT

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1445 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd.h                 |   4 +
 simd/jsimd_powerpc.c         |  13 +++
 simd/jsimd_powerpc_altivec.c | 214 +++++++++++++++++++++++++++++++++++
 3 files changed, 231 insertions(+)

diff --git a/simd/jsimd.h b/simd/jsimd.h
index 4dcdfc1a..f3396110 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -667,6 +667,10 @@ EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
         (DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
          const int * idct_coefs);
 
+EXTERN(void) jsimd_idct_ifast_altivec
+        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
 /* Floating Point Inverse DCT */
 EXTERN(void) jsimd_idct_float_3dnow
         (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index ff37c5f0..eb977d73 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -339,6 +339,17 @@ jsimd_can_idct_islow (void)
 GLOBAL(int)
 jsimd_can_idct_ifast (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only: DCTSIZE 8, 2-byte DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;  /* AltiVec bit is set in simd_support */
+
   return 0;
 }
 
@@ -360,6 +371,8 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
 }
 
 GLOBAL(void)
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index 6f26bd5f..b77208ca 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -478,3 +478,217 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
   *(__vector short *)&data[48] = out6;
   *(__vector short *)&data[56] = out7;
 }
+
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#define F_1_082 277              /* FIX(1.082392200) */
+#define F_1_414 362              /* FIX(1.414213562) */
+#define F_1_847 473              /* FIX(1.847759065) */
+#define F_2_613 669              /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1), i.e. FIX(1) is 256 */
+
+#define IFAST_PASS1_BITS 2       /* the final descale shifts right by IFAST_PASS1_BITS + 3 */
+#define IFAST_CENTERJSAMPLE 128  /* bias added to every packed output sample */
+
+static const __vector short jconst_idct_ifast __attribute__((aligned(16))) =
+{  /* vec_madds() multipliers; only lanes 0-3 are initialized, the rest default to 0 */
+  F_1_414 << IFAST_CONST_SHIFT,
+  F_1_847 << IFAST_CONST_SHIFT,
+  -(F_1_613 << IFAST_CONST_SHIFT),  /* negate after shifting: << on a negative value is undefined in C */
+  F_1_082 << IFAST_CONST_SHIFT
+};
+
+static const __vector signed char jconst_idct_ifast2
+  __attribute__((aligned(16))) =
+{  /* only lane 0 is initialized; it is the lane read via vec_splat(..., 0) */
+  -IFAST_CENTERJSAMPLE  /* -128: same bit pattern as 128, but representable in signed char */
+};
+
+#define DO_IDCT_IFAST(in)  \
+{  /* One 1-D pass: reads in##0..in##7, writes out0..out7; temporaries come from the caller's scope */  \
+  /* Even part: uses inputs 0, 2, 4 and 6 */  \
+  \
+  tmp10 = vec_add(in##0, in##4);  \
+  tmp11 = vec_sub(in##0, in##4);  \
+  tmp13 = vec_add(in##2, in##6);  \
+  \
+  tmp12 = vec_sub(in##2, in##6);  \
+  tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS);  /* pre-scale before the vec_madds multiply */  \
+  tmp12 = vec_madds(tmp12, PW_F1414, zero);  \
+  tmp12 = vec_sub(tmp12, tmp13);  \
+  \
+  tmp0 = vec_add(tmp10, tmp13);  \
+  tmp3 = vec_sub(tmp10, tmp13);  \
+  tmp1 = vec_add(tmp11, tmp12);  \
+  tmp2 = vec_sub(tmp11, tmp12);  \
+  \
+  /* Odd part: uses inputs 1, 3, 5 and 7 (tmp10..tmp12 are reused from here on) */  \
+  \
+  z13 = vec_add(in##5, in##3);  \
+  z10 = vec_sub(in##5, in##3);  \
+  z10s = vec_sl(z10, PRE_MULTIPLY_SCALE_BITS);  /* z10 is kept unshifted as well; it is subtracted below */  \
+  z11 = vec_add(in##1, in##7);  \
+  z12s = vec_sub(in##1, in##7);  \
+  z12s = vec_sl(z12s, PRE_MULTIPLY_SCALE_BITS);  \
+  \
+  tmp11 = vec_sub(z11, z13);  \
+  tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
+  tmp11 = vec_madds(tmp11, PW_F1414, zero);  \
+  \
+  tmp7 = vec_add(z11, z13);  \
+  \
+  z5 = vec_add(z10s, z12s);  \
+  z5 = vec_madds(z5, PW_F1847, zero);  \
+  \
+  tmp10 = vec_madds(z12s, PW_F1082, zero);  \
+  tmp10 = vec_sub(tmp10, z5);  \
+  tmp12 = vec_madds(z10s, PW_MF1613, zero);  \
+  tmp12 = vec_sub(tmp12, z10);  /* presumably completes a -2.613 factor: F_1_613 is F_2_613 - FIX(1) -- confirm */  \
+  tmp12 = vec_add(tmp12, z5);  \
+  \
+  tmp6 = vec_sub(tmp12, tmp7);  \
+  tmp5 = vec_sub(tmp11, tmp6);  \
+  tmp4 = vec_add(tmp10, tmp5);  \
+  \
+  out0 = vec_add(tmp0, tmp7);  /* final butterflies: combine even-part tmp0..3 with odd-part tmp4..7 */  \
+  out1 = vec_add(tmp1, tmp6);  \
+  out2 = vec_add(tmp2, tmp5);  \
+  out3 = vec_sub(tmp3, tmp4);  \
+  out4 = vec_add(tmp3, tmp4);  \
+  out5 = vec_sub(tmp2, tmp5);  \
+  out6 = vec_sub(tmp1, tmp6);  \
+  out7 = vec_sub(tmp0, tmp7);  \
+}
+
+void
+jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{  /* Multiply 64 coefficients by dct_table, run two DO_IDCT_IFAST passes, store 8 bytes to each of output_buf[0..7] */
+  short *dct_table = (short *)dct_table_;  /* table entries are read as 16-bit values */
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector signed char outb;
+  long long *outptr, *outbptr = (long long *)(&outb);  /* outbptr views outb as long long words */
+
+  /* Constants (each multiplier is splatted from one lane of jconst_idct_ifast) */
+  __vector short zero = vec_splat_s16(0),
+    PW_F1414 = vec_splat(jconst_idct_ifast, 0),
+    PW_F1847 = vec_splat(jconst_idct_ifast, 1),
+    PW_MF1613 = vec_splat(jconst_idct_ifast, 2),
+    PW_F1082 = vec_splat(jconst_idct_ifast, 3);
+  __vector unsigned short
+    PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS),
+    PASS1_BITS3 = vec_splat_u16(IFAST_PASS1_BITS + 3);
+  __vector signed char PB_CENTERJSAMP = vec_splat(jconst_idct_ifast2, 0);
+
+  /* Pass 1: process columns (colN holds coef_block[8*N .. 8*N+7]). */
+
+  col0 = *(__vector short *)&coef_block[0];
+  col1 = *(__vector short *)&coef_block[8];
+  col2 = *(__vector short *)&coef_block[16];
+  col3 = *(__vector short *)&coef_block[24];
+  col4 = *(__vector short *)&coef_block[32];
+  col5 = *(__vector short *)&coef_block[40];
+  col6 = *(__vector short *)&coef_block[48];
+  col7 = *(__vector short *)&coef_block[56];
+
+  tmp1 = vec_or(col1, col2);  /* OR col1..col7 together so a single compare detects "all zero" */
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = *(__vector short *)&dct_table[0];
+  col0 = vec_mladd(col0, quant0, zero);  /* col0 * quant0 + 0: needed by both branches below */
+
+  if (vec_all_eq(tmp1, zero)) {
+    /* AC terms all zero: skip DO_IDCT_IFAST(col) and broadcast lane N of col0 into rowN */
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = *(__vector short *)&dct_table[8];
+    quant2 = *(__vector short *)&dct_table[16];
+    quant3 = *(__vector short *)&dct_table[24];
+    quant4 = *(__vector short *)&dct_table[32];
+    quant5 = *(__vector short *)&dct_table[40];
+    quant6 = *(__vector short *)&dct_table[48];
+    quant7 = *(__vector short *)&dct_table[56];
+
+    col1 = vec_mladd(col1, quant1, zero);  /* multiply the remaining vectors by their table entries */
+    col2 = vec_mladd(col2, quant2, zero);
+    col3 = vec_mladd(col3, quant3, zero);
+    col4 = vec_mladd(col4, quant4, zero);
+    col5 = vec_mladd(col5, quant5, zero);
+    col6 = vec_mladd(col6, quant6, zero);
+    col7 = vec_mladd(col7, quant7, zero);
+
+    DO_IDCT_IFAST(col);
+
+    TRANSPOSE(out, row);  /* out0..7 -> row0..7; TRANSPOSE is not defined in this hunk */
+  }
+
+  /* Pass 2: process rows, then descale. */
+
+  DO_IDCT_IFAST(row);
+
+  out0 = vec_sra(out0, PASS1_BITS3);  /* arithmetic right shift by IFAST_PASS1_BITS + 3 */
+  out1 = vec_sra(out1, PASS1_BITS3);
+  out2 = vec_sra(out2, PASS1_BITS3);
+  out3 = vec_sra(out3, PASS1_BITS3);
+  out4 = vec_sra(out4, PASS1_BITS3);
+  out5 = vec_sra(out5, PASS1_BITS3);
+  out6 = vec_sra(out6, PASS1_BITS3);
+  out7 = vec_sra(out7, PASS1_BITS3);
+
+  TRANSPOSE(out, col);  /* col0..7 are reused here to hold the transposed results */
+
+  outb = vec_packs(col0, col1);  /* saturating pack of two 8 x 16-bit vectors into 16 signed bytes */
+  outb = vec_add(outb, PB_CENTERJSAMP);  /* add the centering bias to every byte */
+  outptr = (long long *)(output_buf[0] + output_col);  /* NOTE(review): store through long long * assumes alignment/aliasing is tolerated -- confirm */
+  *outptr = outbptr[0];  /* first long long of outb (from col0) */
+  outptr = (long long *)(output_buf[1] + output_col);
+  *outptr = outbptr[1];  /* second long long of outb (from col1) */
+
+  outb = vec_packs(col2, col3);  /* same pack / bias / store sequence for output_buf[2] and [3] */
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[2] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[3] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col4, col5);  /* output_buf[4] and [5] */
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[4] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[5] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col6, col7);  /* output_buf[6] and [7] */
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[6] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[7] + output_col);
+  *outptr = outbptr[1];
+}

From a546be514155c8d64716c7d0d067237d714efb64 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 19 Dec 2014 10:46:00 +0000
Subject: [PATCH 053/108] Add iOS architectures to the shared libraries
 generated by the Mac/iOS packaging system.  I have no idea how useful this is
 for "standard" iOS application development, but it is useful in a jailbreak
 environment, and iOS 8 supposedly allows shared libs in "official" apps as
 well.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1448 632fc199-4ca6-4c93-a231-07263d6284db
---
 BUILDING.txt          |   2 +-
 release/makemacpkg.in | 127 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/BUILDING.txt b/BUILDING.txt
index 5b5b7a49..a19d8ce6 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -368,7 +368,7 @@ Set the following shell variables for simplicity:
 
 Follow the procedure under "Building libjpeg-turbo" above, adding
 
-  --host arm-apple-darwin10 --enable-static --disable-shared \
+  --host arm-apple-darwin10 \
     CC="$IOS_GCC" LD="$IOS_GCC" \
     CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
     LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 16c18deb..d0edecb0 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -173,14 +173,55 @@ if [ $BUILDARMV6 = 1 ]; then
 	pushd $BUILDDIRARMV6
 	make install DESTDIR=$TMPDIR/dist.armv6
 	popd
+	if [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
+		lipo -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
+	elif [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
+		lipo -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
+	fi
 	lipo -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
 		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
+	lipo -create \
+		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
+		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.0.dylib \
+		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
 	lipo -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
 		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
+	lipo -create \
+		$PKGROOT/$BINDIR/cjpeg \
+		-arch arm $TMPDIR/dist.armv6/$BINDIR/cjpeg \
+		-output $PKGROOT/$BINDIR/cjpeg
+	lipo -create \
+		$PKGROOT/$BINDIR/djpeg \
+		-arch arm $TMPDIR/dist.armv6/$BINDIR/djpeg \
+		-output $PKGROOT/$BINDIR/djpeg
+	lipo -create \
+		$PKGROOT/$BINDIR/jpegtran \
+		-arch arm $TMPDIR/dist.armv6/$BINDIR/jpegtran \
+		-output $PKGROOT/$BINDIR/jpegtran
+	lipo -create \
+		$PKGROOT/$BINDIR/tjbench \
+		-arch arm $TMPDIR/dist.armv6/$BINDIR/tjbench \
+		-output $PKGROOT/$BINDIR/tjbench
+	lipo -create \
+		$PKGROOT/$BINDIR/rdjpgcom \
+		-arch arm $TMPDIR/dist.armv6/$BINDIR/rdjpgcom \
+		-output $PKGROOT/$BINDIR/rdjpgcom
+	lipo -create \
+		$PKGROOT/$BINDIR/wrjpgcom \
+		-arch arm $TMPDIR/dist.armv6/$BINDIR/wrjpgcom \
+		-output $PKGROOT/$BINDIR/wrjpgcom
 fi
 
 if [ $BUILDARMV7 = 1 ]; then
@@ -196,14 +237,55 @@ if [ $BUILDARMV7 = 1 ]; then
 	pushd $BUILDDIRARMV7
 	make install DESTDIR=$TMPDIR/dist.armv7
 	popd
+	if [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
+		lipo -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
+	elif [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
+		lipo -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
+	fi
 	lipo -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
 		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
+	lipo -create \
+		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
+		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.0.dylib \
+		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
 	lipo -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
 		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
+	lipo -create \
+		$PKGROOT/$BINDIR/cjpeg \
+		-arch arm $TMPDIR/dist.armv7/$BINDIR/cjpeg \
+		-output $PKGROOT/$BINDIR/cjpeg
+	lipo -create \
+		$PKGROOT/$BINDIR/djpeg \
+		-arch arm $TMPDIR/dist.armv7/$BINDIR/djpeg \
+		-output $PKGROOT/$BINDIR/djpeg
+	lipo -create \
+		$PKGROOT/$BINDIR/jpegtran \
+		-arch arm $TMPDIR/dist.armv7/$BINDIR/jpegtran \
+		-output $PKGROOT/$BINDIR/jpegtran
+	lipo -create \
+		$PKGROOT/$BINDIR/tjbench \
+		-arch arm $TMPDIR/dist.armv7/$BINDIR/tjbench \
+		-output $PKGROOT/$BINDIR/tjbench
+	lipo -create \
+		$PKGROOT/$BINDIR/rdjpgcom \
+		-arch arm $TMPDIR/dist.armv7/$BINDIR/rdjpgcom \
+		-output $PKGROOT/$BINDIR/rdjpgcom
+	lipo -create \
+		$PKGROOT/$BINDIR/wrjpgcom \
+		-arch arm $TMPDIR/dist.armv7/$BINDIR/wrjpgcom \
+		-output $PKGROOT/$BINDIR/wrjpgcom
 fi
 
 if [ $BUILDARMV7S = 1 ]; then
@@ -219,14 +301,55 @@ if [ $BUILDARMV7S = 1 ]; then
 	pushd $BUILDDIRARMV7S
 	make install DESTDIR=$TMPDIR/dist.armv7s
 	popd
+	if [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
+		lipo -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
+	elif [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
+		lipo -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
+	fi
 	lipo -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
 		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
+	lipo -create \
+		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
+		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.0.dylib \
+		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
 	lipo -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
 		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
+	lipo -create \
+		$PKGROOT/$BINDIR/cjpeg \
+		-arch arm $TMPDIR/dist.armv7s/$BINDIR/cjpeg \
+		-output $PKGROOT/$BINDIR/cjpeg
+	lipo -create \
+		$PKGROOT/$BINDIR/djpeg \
+		-arch arm $TMPDIR/dist.armv7s/$BINDIR/djpeg \
+		-output $PKGROOT/$BINDIR/djpeg
+	lipo -create \
+		$PKGROOT/$BINDIR/jpegtran \
+		-arch arm $TMPDIR/dist.armv7s/$BINDIR/jpegtran \
+		-output $PKGROOT/$BINDIR/jpegtran
+	lipo -create \
+		$PKGROOT/$BINDIR/tjbench \
+		-arch arm $TMPDIR/dist.armv7s/$BINDIR/tjbench \
+		-output $PKGROOT/$BINDIR/tjbench
+	lipo -create \
+		$PKGROOT/$BINDIR/rdjpgcom \
+		-arch arm $TMPDIR/dist.armv7s/$BINDIR/rdjpgcom \
+		-output $PKGROOT/$BINDIR/rdjpgcom
+	lipo -create \
+		$PKGROOT/$BINDIR/wrjpgcom \
+		-arch arm $TMPDIR/dist.armv7s/$BINDIR/wrjpgcom \
+		-output $PKGROOT/$BINDIR/wrjpgcom
 fi
 
 install_name_tool -id $LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
@@ -259,7 +382,7 @@ productbuild --distribution $SRCDIR/release/Distribution.xml \
 	--package-path $TMPDIR/pkg/ --resources $TMPDIR/pkg/ \
 	$TMPDIR/dmg/$PACKAGE_NAME.pkg
 hdiutil create -fs HFS+ -volname $PACKAGE_NAME-$VERSION \
-	-srcfolder "$TMPDIR/dmg" $TMPDIR/$PACKAGE_NAME-$VERSION.dmg 
-cp $TMPDIR/$PACKAGE_NAME-$VERSION.dmg . 
+	-srcfolder "$TMPDIR/dmg" $TMPDIR/$PACKAGE_NAME-$VERSION.dmg
+cp $TMPDIR/$PACKAGE_NAME-$VERSION.dmg .
 
 exit

From 81a64020e3db2d04a4399a1035dffa11e764a71f Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 19 Dec 2014 18:19:31 +0000
Subject: [PATCH 054/108] Remove reference to install.txt, which we do not
 include from the IJG distribution

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1455 632fc199-4ca6-4c93-a231-07263d6284db
---
 README | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README b/README
index 44e69d39..e82a095b 100644
--- a/README
+++ b/README
@@ -36,7 +36,6 @@ TO DO               Plans for future IJG releases.
 Other documentation files in the distribution are:
 
 User documentation:
-  install.txt       How to configure and install the IJG software.
   usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                     rdjpgcom, and wrjpgcom.
   *.1               Unix-style man pages for programs (same info as usage.txt).
@@ -48,9 +47,9 @@ Programmer and internal documentation:
   structure.txt     Overview of the JPEG library's internal structure.
   coderules.txt     Coding style rules --- please read if you contribute code.
 
-Please read at least the files install.txt and usage.txt.  Some information
-can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
-ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
+Please read at least usage.txt.  Some information can also be found in the JPEG
+FAQ (Frequently Asked Questions) article.  See ARCHIVE LOCATIONS below to find
+out where to obtain the FAQ article.
 
 If you want to understand how the JPEG code works, we suggest reading one or
 more of the REFERENCES, then looking at the documentation files (in roughly

From aa805bc89fef7c42d61a44754f28d9ad07c5546f Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 19 Dec 2014 18:23:52 +0000
Subject: [PATCH 055/108] Modify the ARM64 assembly file so that it uses only
 syntax that the clang assembler in XCode 5.x can understand.  These changes
 should all be cosmetic in nature-- they do not change the meaning or
 readability of the code nor the ability to build it for Linux.  Actually, the
 code is now more in compliance with the ARM64 programming manual.  In
 addition to these changes, there were a couple of instructions that clang
 simply doesn't support, so gas-preprocessor.pl was modified so that it now
 converts those into equivalent instructions that clang can handle.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1456 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_arm64_neon.S | 493 ++++++++++++++++++++--------------------
 1 file changed, 247 insertions(+), 246 deletions(-)

diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index f488b0f1..2186f248 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -6,6 +6,7 @@
  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2013-2014, Linaro Limited
  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014, D. R. Commander.  All rights reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -197,21 +198,21 @@ _\fname:
     tmp13 = q1;                                                               \
 }
 
-#define XFIX_0_899976223                    v0.4h[0]
-#define XFIX_0_541196100                    v0.4h[1]
-#define XFIX_2_562915447                    v0.4h[2]
-#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
-#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
-#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
-#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
-#define XFIX_1_175875602                    v1.4h[3]
-#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]
-#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
-#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
-#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]
+#define XFIX_0_899976223                    v0.h[0]
+#define XFIX_0_541196100                    v0.h[1]
+#define XFIX_2_562915447                    v0.h[2]
+#define XFIX_0_298631336_MINUS_0_899976223  v0.h[3]
+#define XFIX_1_501321110_MINUS_0_899976223  v1.h[0]
+#define XFIX_2_053119869_MINUS_2_562915447  v1.h[1]
+#define XFIX_0_541196100_PLUS_0_765366865   v1.h[2]
+#define XFIX_1_175875602                    v1.h[3]
+#define XFIX_1_175875602_MINUS_0_390180644  v2.h[0]
+#define XFIX_0_541196100_MINUS_1_847759065  v2.h[1]
+#define XFIX_3_072711026_MINUS_2_562915447  v2.h[2]
+#define XFIX_1_175875602_MINUS_1_961570560  v2.h[3]
 
 .balign 16
-jsimd_idct_islow_neon_consts:
+Ljsimd_idct_islow_neon_consts:
     .short FIX_0_899976223                    /* d0[0] */
     .short FIX_0_541196100                    /* d0[1] */
     .short FIX_2_562915447                    /* d0[2] */
@@ -256,54 +257,54 @@ asm_function jsimd_idct_islow_neon
     /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
     sub             sp, sp, 272
     str             x15, [sp], 16
-    adr             x15, jsimd_idct_islow_neon_consts
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v20.8b - v23.8b}, [sp], 32
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v28.8b - v31.8b}, [sp], 32
+    adr             x15, Ljsimd_idct_islow_neon_consts
+    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
     ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
     mul             v16.4h, v16.4h, v0.4h
     mul             v17.4h, v17.4h, v1.4h
-    ins             v16.2d[1], v17.2d[0]  /* 128 bit q8 */
+    ins             v16.d[1], v17.d[0]  /* 128 bit q8 */
     ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
     mul             v18.4h, v18.4h, v2.4h
     mul             v19.4h, v19.4h, v3.4h
-    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
+    ins             v18.d[1], v19.d[0]  /* 128 bit q9 */
     ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
     mul             v20.4h, v20.4h, v4.4h
     mul             v21.4h, v21.4h, v5.4h
-    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
+    ins             v20.d[1], v21.d[0]  /* 128 bit q10 */
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
     mul             v22.4h, v22.4h, v6.4h
     mul             v23.4h, v23.4h, v7.4h
-    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
+    ins             v22.d[1], v23.d[0]  /* 128 bit q11 */
     ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
     mul             v24.4h, v24.4h, v0.4h
     mul             v25.4h, v25.4h, v1.4h
-    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
+    ins             v24.d[1], v25.d[0]  /* 128 bit q12 */
     ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
     mul             v28.4h, v28.4h, v4.4h
     mul             v29.4h, v29.4h, v5.4h
-    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
+    ins             v28.d[1], v29.d[0]  /* 128 bit q14 */
     mul             v26.4h, v26.4h, v2.4h
     mul             v27.4h, v27.4h, v3.4h
-    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
+    ins             v26.d[1], v27.d[0]  /* 128 bit q13 */
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
     add             x15, x15, #16
     mul             v30.4h, v30.4h, v6.4h
     mul             v31.4h, v31.4h, v7.4h
-    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
+    ins             v30.d[1], v31.d[0]  /* 128 bit q15 */
     /* Go to the bottom of the stack */
     sub             sp, sp, 352
     stp             x4, x5, [sp], 16
-    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers */
-    st1             {v12.4h - v15.4h}, [sp], 32
+    st1             {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32  /* save NEON registers */
+    st1             {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
     /* 1-D IDCT, pass 1, left 4x8 half */
     add             v4.4h,    ROW7L.4h, ROW3L.4h
     add             v5.4h,    ROW5L.4h, ROW1L.4h
@@ -378,7 +379,7 @@ asm_function jsimd_idct_islow_neon
     rshrn           ROW0L.4h, v12.4s,   #11
     rshrn           ROW4L.4h, v6.4s,    #11
 
-      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
+      b.eq          3f /* Go to do some special handling for the sparse right 4x8 half */
 
     /* 1-D IDCT, pass 1, right 4x8 half */
     ld1             {v2.4h},  [x15]    /* reload constants */
@@ -553,33 +554,33 @@ asm_function jsimd_idct_islow_neon
     shrn            ROW4R.4h, v6.4s,    #16
 
 2:  /* Descale to 8-bit and range limit */
-    ins             v16.2d[1], v17.2d[0]
-    ins             v18.2d[1], v19.2d[0]
-    ins             v20.2d[1], v21.2d[0]
-    ins             v22.2d[1], v23.2d[0]
+    ins             v16.d[1], v17.d[0]
+    ins             v18.d[1], v19.d[0]
+    ins             v20.d[1], v21.d[0]
+    ins             v22.d[1], v23.d[0]
     sqrshrn         v16.8b,   v16.8h,   #2
     sqrshrn2        v16.16b,  v18.8h,   #2
     sqrshrn         v18.8b,   v20.8h,   #2
     sqrshrn2        v18.16b,  v22.8h,   #2
 
     /* vpop            {v8.4h - d15.4h} */ /* restore NEON registers */
-    ld1             {v8.4h - v11.4h}, [sp], 32
-    ld1             {v12.4h - v15.4h}, [sp], 32
-    ins             v24.2d[1], v25.2d[0]
+    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
+    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
+    ins             v24.d[1], v25.d[0]
 
     sqrshrn         v20.8b,   v24.8h,   #2
       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
     /* trn1            v16.8h,    v16.8h,  v18.8h */
     transpose       v16, v18, v3, .16b, .8h
-    ins             v26.2d[1], v27.2d[0]
-    ins             v28.2d[1], v29.2d[0]
-    ins             v30.2d[1], v31.2d[0]
+    ins             v26.d[1], v27.d[0]
+    ins             v28.d[1], v29.d[0]
+    ins             v30.d[1], v31.d[0]
     sqrshrn2        v20.16b,  v26.8h,   #2
     sqrshrn         v22.8b,   v28.8h,   #2
     movi            v0.16b,   #(CENTERJSAMPLE)
     sqrshrn2        v22.16b,  v30.8h,   #2
-    transpose_single v16, v17, v3, .2d, .8b
-    transpose_single v18, v19, v3, .2d, .8b
+    transpose_single v16, v17, v3, .d, .8b
+    transpose_single v18, v19, v3, .d, .8b
     add             v16.8b,   v16.8b,   v0.8b
     add             v17.8b,   v17.8b,   v0.8b
     add             v18.8b,   v18.8b,   v0.8b
@@ -590,7 +591,7 @@ asm_function jsimd_idct_islow_neon
     add             TMP1,     TMP1,     OUTPUT_COL
     add             TMP2,     TMP2,     OUTPUT_COL
     st1             {v16.8b}, [TMP1]
-    transpose_single v20, v21, v3, .2d, .8b
+    transpose_single v20, v21, v3, .d, .8b
     st1             {v17.8b}, [TMP2]
     ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
     add             TMP1,     TMP1,     OUTPUT_COL
@@ -605,7 +606,7 @@ asm_function jsimd_idct_islow_neon
     add             TMP2,     TMP2,     OUTPUT_COL
     add             TMP3,     TMP3,     OUTPUT_COL
     add             TMP4,     TMP4,     OUTPUT_COL
-    transpose_single v22, v23, v3, .2d, .8b
+    transpose_single v22, v23, v3, .d, .8b
     st1             {v20.8b}, [TMP1]
     add             v22.8b,   v22.8b,   v0.8b
     add             v23.8b,   v23.8b,   v0.8b
@@ -613,14 +614,14 @@ asm_function jsimd_idct_islow_neon
     st1             {v22.8b}, [TMP3]
     st1             {v23.8b}, [TMP4]
     ldr             x15, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v20.8b - v23.8b}, [sp], 32
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v28.8b - v31.8b}, [sp], 32
+    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     blr             x30
 
 3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
@@ -636,17 +637,17 @@ asm_function jsimd_idct_islow_neon
     transpose       ROW0L, ROW2L, v3, .16b, .2s
     transpose       ROW5L, ROW7L, v3, .16b, .2s
     cmp             x0, #0
-    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    b.eq            4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
 
     /* Only row 0 is non-zero for the right 4x8 half  */
-    dup             ROW1R.4h, ROW0R.4h[1]
-    dup             ROW2R.4h, ROW0R.4h[2]
-    dup             ROW3R.4h, ROW0R.4h[3]
-    dup             ROW4R.4h, ROW0R.4h[0]
-    dup             ROW5R.4h, ROW0R.4h[1]
-    dup             ROW6R.4h, ROW0R.4h[2]
-    dup             ROW7R.4h, ROW0R.4h[3]
-    dup             ROW0R.4h, ROW0R.4h[0]
+    dup             ROW1R.4h, ROW0R.h[1]
+    dup             ROW2R.4h, ROW0R.h[2]
+    dup             ROW3R.4h, ROW0R.h[3]
+    dup             ROW4R.4h, ROW0R.h[0]
+    dup             ROW5R.4h, ROW0R.h[1]
+    dup             ROW6R.4h, ROW0R.h[2]
+    dup             ROW7R.4h, ROW0R.h[3]
+    dup             ROW0R.4h, ROW0R.h[0]
     b               1b /* Go to 'normal' second pass */
 
 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
@@ -770,13 +771,13 @@ asm_function jsimd_idct_islow_neon
  * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
  */
 
-#define XFIX_1_082392200 v0.4h[0]
-#define XFIX_1_414213562 v0.4h[1]
-#define XFIX_1_847759065 v0.4h[2]
-#define XFIX_2_613125930 v0.4h[3]
+#define XFIX_1_082392200 v0.h[0]
+#define XFIX_1_414213562 v0.h[1]
+#define XFIX_1_847759065 v0.h[2]
+#define XFIX_2_613125930 v0.h[3]
 
 .balign 16
-jsimd_idct_ifast_neon_consts:
+Ljsimd_idct_ifast_neon_consts:
     .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
     .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
     .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
@@ -810,12 +811,12 @@ asm_function jsimd_idct_ifast_neon
     /* Save NEON registers used in fast IDCT */
     sub             sp, sp, #176
     stp             x22, x23, [sp], 16
-    adr             x23, jsimd_idct_ifast_neon_consts
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
+    adr             x23, Ljsimd_idct_ifast_neon_consts
+    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
     ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
     ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
@@ -909,24 +910,24 @@ asm_function jsimd_idct_ifast_neon
     trn2            v15.4s,   v18.4s,   v15.4s
     /* vswp            v14.4h,   v10-MSB.4h */
     umov            x22, v14.d[0]
-    ins             v14.2d[0], v10.2d[1]
-    ins             v10.2d[1], x22
+    ins             v14.d[0], v10.d[1]
+    ins             v10.d[1], x22
     /* vswp            v13.4h,   v9MSB.4h */
 
     umov            x22, v13.d[0]
-    ins             v13.2d[0], v9.2d[1]
-    ins             v9.2d[1], x22
+    ins             v13.d[0], v9.d[1]
+    ins             v9.d[1], x22
     /* 1-D IDCT, pass 2 */
     sub             v2.8h,    v10.8h,   v14.8h
     /* vswp            v15.4h,   v11MSB.4h */
     umov            x22, v15.d[0]
-    ins             v15.2d[0], v11.2d[1]
-    ins             v11.2d[1], x22
+    ins             v15.d[0], v11.d[1]
+    ins             v11.d[1], x22
     add             v14.8h,   v10.8h,   v14.8h
     /* vswp            v12.4h,   v8-MSB.4h */
     umov            x22, v12.d[0]
-    ins             v12.2d[0], v8.2d[1]
-    ins             v8.2d[1], x22
+    ins             v12.d[0], v8.d[1]
+    ins             v8.d[1],  x22
     sub             v1.8h,    v11.8h,   v13.8h
     add             v13.8h,   v11.8h,   v13.8h
     sub             v5.8h,    v9.8h,    v15.8h
@@ -997,13 +998,13 @@ asm_function jsimd_idct_ifast_neon
     trn1            v9.4s,    v9.4s,    v11.4s
     trn2            v11.4s,   v18.4s,   v11.4s
     /* make copy */
-    ins             v17.2d[0], v8.2d[1]
+    ins             v17.d[0], v8.d[1]
     /* Transpose  d16-d17-msb */
     mov             v18.16b,  v8.16b
     trn1            v8.8b,    v8.8b,    v17.8b
     trn2            v17.8b,   v18.8b,   v17.8b
     /* make copy */
-    ins             v19.2d[0], v9.2d[1]
+    ins             v19.d[0], v9.d[1]
     mov             v18.16b,  v9.16b
     trn1            v9.8b,    v9.8b,    v19.8b
     trn2            v19.8b,   v18.8b,   v19.8b
@@ -1018,7 +1019,7 @@ asm_function jsimd_idct_ifast_neon
     add             TMP2,     TMP2,     OUTPUT_COL
     st1             {v9.8b},  [TMP1]
     /* make copy */
-    ins             v7.2d[0], v10.2d[1]
+    ins             v7.d[0],  v10.d[1]
     mov             v18.16b,  v10.16b
     trn1            v10.8b,   v10.8b,   v7.8b
     trn2            v7.8b,    v18.8b,   v7.8b
@@ -1031,7 +1032,7 @@ asm_function jsimd_idct_ifast_neon
     add             TMP5,     TMP5,     OUTPUT_COL
     st1             {v10.8b}, [TMP1]
     /* make copy */
-    ins             v16.2d[0], v11.2d[1]
+    ins             v16.d[0], v11.d[1]
     mov             v18.16b,  v11.16b
     trn1            v11.8b,   v11.8b,   v16.8b
     trn2            v16.8b,   v18.8b,   v16.8b
@@ -1040,11 +1041,11 @@ asm_function jsimd_idct_ifast_neon
     st1             {v16.8b}, [TMP5]
     sub             sp, sp, #176
     ldp             x22, x23, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
+    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1095,38 +1096,38 @@ asm_function jsimd_idct_ifast_neon
 #define FIX_3_624509785  (29692) /* FIX(3.624509785) */
 
 .balign 16
-jsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* v0.4h[0] */
-    .short     -FIX_0_765366865    /* v0.4h[1] */
-    .short     -FIX_0_211164243    /* v0.4h[2] */
-    .short     FIX_1_451774981     /* v0.4h[3] */
+Ljsimd_idct_4x4_neon_consts:
+    .short     FIX_1_847759065     /* v0.h[0] */
+    .short     -FIX_0_765366865    /* v0.h[1] */
+    .short     -FIX_0_211164243    /* v0.h[2] */
+    .short     FIX_1_451774981     /* v0.h[3] */
     .short     -FIX_2_172734803    /* d1[0] */
     .short     FIX_1_061594337     /* d1[1] */
     .short     -FIX_0_509795579    /* d1[2] */
     .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* v2.4h[0] */
-    .short     FIX_2_562915447     /* v2.4h[1] */
-    .short     1 << (CONST_BITS+1) /* v2.4h[2] */
-    .short     0                   /* v2.4h[3] */
+    .short     FIX_0_899976223     /* v2.h[0] */
+    .short     FIX_2_562915447     /* v2.h[1] */
+    .short     1 << (CONST_BITS+1) /* v2.h[2] */
+    .short     0                   /* v2.h[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4,    v2.4h[2]
-    smlal           v28.4s, \x8,    v0.4h[0]
-    smlal           v28.4s, \x14,   v0.4h[1]
+    smull           v28.4s, \x4,    v2.h[2]
+    smlal           v28.4s, \x8,    v0.h[0]
+    smlal           v28.4s, \x14,   v0.h[1]
 
-    smull           v26.4s, \x16,   v1.4h[2]
-    smlal           v26.4s, \x12,   v1.4h[3]
-    smlal           v26.4s, \x10,   v2.4h[0]
-    smlal           v26.4s, \x6,    v2.4h[1]
+    smull           v26.4s, \x16,   v1.h[2]
+    smlal           v26.4s, \x12,   v1.h[3]
+    smlal           v26.4s, \x10,   v2.h[0]
+    smlal           v26.4s, \x6,    v2.h[1]
 
-    smull           v30.4s, \x4,    v2.4h[2]
-    smlsl           v30.4s, \x8,    v0.4h[0]
-    smlsl           v30.4s, \x14,   v0.4h[1]
+    smull           v30.4s, \x4,    v2.h[2]
+    smlsl           v30.4s, \x8,    v0.h[0]
+    smlsl           v30.4s, \x14,   v0.h[1]
 
-    smull           v24.4s, \x16,   v0.4h[2]
-    smlal           v24.4s, \x12,   v0.4h[3]
-    smlal           v24.4s, \x10,   v1.4h[0]
-    smlal           v24.4s, \x6,    v1.4h[1]
+    smull           v24.4s, \x16,   v0.h[2]
+    smlal           v24.4s, \x12,   v0.h[3]
+    smlal           v24.4s, \x10,   v1.h[0]
+    smlal           v24.4s, \x6,    v1.h[1]
 
     add             v20.4s, v28.4s, v26.4s
     sub             v28.4s, v28.4s, v26.4s
@@ -1171,15 +1172,15 @@ asm_function jsimd_idct_4x4_neon
     sub             sp, sp, 272
     str             x15, [sp], 16
     /* Load constants (v3.4h is just used for padding) */
-    adr             TMP4, jsimd_idct_4x4_neon_consts
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v20.8b - v23.8b}, [sp], 32
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v28.8b - v31.8b}, [sp], 32
+    adr             TMP4, Ljsimd_idct_4x4_neon_consts
+    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1203,45 +1204,45 @@ asm_function jsimd_idct_4x4_neon
     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
     mul             v4.4h, v4.4h, v18.4h
     mul             v5.4h, v5.4h, v19.4h
-    ins             v4.2d[1], v5.2d[0]    /* 128 bit q4 */
+    ins             v4.d[1], v5.d[0]    /* 128 bit q4 */
     ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
     mul             v6.4h, v6.4h, v20.4h
     mul             v7.4h, v7.4h, v21.4h
-    ins             v6.2d[1], v7.2d[0]    /* 128 bit q6 */
+    ins             v6.d[1], v7.d[0]    /* 128 bit q6 */
     mul             v8.4h, v8.4h, v22.4h
     mul             v9.4h, v9.4h, v23.4h
-    ins             v8.2d[1], v9.2d[0]    /* 128 bit q8 */
+    ins             v8.d[1], v9.d[0]    /* 128 bit q8 */
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
     mul             v10.4h, v10.4h, v24.4h
     mul             v11.4h, v11.4h, v25.4h
-    ins             v10.2d[1], v11.2d[0]  /* 128 bit q10 */
+    ins             v10.d[1], v11.d[0]  /* 128 bit q10 */
     mul             v12.4h, v12.4h, v26.4h
     mul             v13.4h, v13.4h, v27.4h
-    ins             v12.2d[1], v13.2d[0]  /* 128 bit q12 */
+    ins             v12.d[1], v13.d[0]  /* 128 bit q12 */
     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
     mul             v14.4h, v14.4h, v28.4h
     mul             v15.4h, v15.4h, v29.4h
-    ins             v14.2d[1], v15.2d[0]  /* 128 bit q14 */
+    ins             v14.d[1], v15.d[0]  /* 128 bit q14 */
     mul             v16.4h, v16.4h, v30.4h
     mul             v17.4h, v17.4h, v31.4h
-    ins             v16.2d[1], v17.2d[0]  /* 128 bit q16 */
+    ins             v16.d[1], v17.d[0]  /* 128 bit q16 */
 
     /* Pass 1 */
     idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
     transpose_4x4   v4, v6, v8, v10, v3
-    ins             v10.2d[1], v11.2d[0]
+    ins             v10.d[1], v11.d[0]
     idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
     transpose_4x4   v5, v7, v9, v11, v3
-    ins             v10.2d[1], v11.2d[0]
+    ins             v10.d[1], v11.d[0]
     /* Pass 2 */
     idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
     transpose_4x4   v26, v27, v28, v29, v3
 
     /* Range limit */
     movi            v30.8h, #0x80
-    ins             v26.2d[1], v27.2d[0]
-    ins             v28.2d[1], v29.2d[0]
+    ins             v26.d[1], v27.d[0]
+    ins             v28.d[1], v29.d[0]
     add             v26.8h, v26.8h, v30.8h
     add             v28.8h, v28.8h, v30.8h
     sqxtun          v26.8b, v26.8h
@@ -1286,14 +1287,14 @@ asm_function jsimd_idct_4x4_neon
     /* vpop            {v8.4h - v15.4h}    ;not available */
     sub             sp, sp, #272
     ldr             x15, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v20.8b - v23.8b}, [sp], 32
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v28.8b - v31.8b}, [sp], 32
+    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1325,7 +1326,7 @@ asm_function jsimd_idct_4x4_neon
  */
 
 .balign 8
-jsimd_idct_2x2_neon_consts:
+Ljsimd_idct_2x2_neon_consts:
     .short     -FIX_0_720959822    /* v14[0] */
     .short     FIX_0_850430095     /* v14[1] */
     .short     -FIX_1_272758580    /* v14[2] */
@@ -1333,10 +1334,10 @@ jsimd_idct_2x2_neon_consts:
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
     sshll      v15.4s, \x4,    #15
-    smull      v26.4s, \x6,    v14.4h[3]
-    smlal      v26.4s, \x10,   v14.4h[2]
-    smlal      v26.4s, \x12,   v14.4h[1]
-    smlal      v26.4s, \x16,   v14.4h[0]
+    smull      v26.4s, \x6,    v14.h[3]
+    smlal      v26.4s, \x10,   v14.h[2]
+    smlal      v26.4s, \x12,   v14.h[1]
+    smlal      v26.4s, \x16,   v14.h[0]
 
     add        v20.4s, v15.4s, v26.4s
     sub        v15.4s, v15.4s, v26.4s
@@ -1367,14 +1368,14 @@ asm_function jsimd_idct_2x2_neon
     str             x15, [sp], 16
 
     /* Load constants */
-    adr             TMP2, jsimd_idct_2x2_neon_consts
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v21.8b - v22.8b}, [sp], 16
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v30.8b - v31.8b}, [sp], 16
+    adr             TMP2, Ljsimd_idct_2x2_neon_consts
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v21.8b, v22.8b}, [sp], 16
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v30.8b, v31.8b}, [sp], 16
     ld1             {v14.4h}, [TMP2]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1400,25 +1401,25 @@ asm_function jsimd_idct_2x2_neon
     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
     mul             v4.4h, v4.4h, v18.4h
     mul             v5.4h, v5.4h, v19.4h
-    ins             v4.2d[1], v5.2d[0]
+    ins             v4.d[1], v5.d[0]
     mul             v6.4h, v6.4h, v20.4h
     mul             v7.4h, v7.4h, v21.4h
-    ins             v6.2d[1], v7.2d[0]
+    ins             v6.d[1], v7.d[0]
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
     mul             v10.4h, v10.4h, v24.4h
     mul             v11.4h, v11.4h, v25.4h
-    ins             v10.2d[1], v11.2d[0]
+    ins             v10.d[1], v11.d[0]
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
     mul             v12.4h, v12.4h, v26.4h
     mul             v13.4h, v13.4h, v27.4h
-    ins             v12.2d[1], v13.2d[0]
+    ins             v12.d[1], v13.d[0]
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
     mul             v16.4h, v16.4h, v30.4h
     mul             v17.4h, v17.4h, v31.4h
-    ins             v16.2d[1], v17.2d[0]
+    ins             v16.d[1], v17.d[0]
 
     /* Pass 1 */
 #if 0
@@ -1427,14 +1428,14 @@ asm_function jsimd_idct_2x2_neon
     idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
     transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
 #else
-    smull           v26.4s, v6.4h,  v14.4h[3]
-    smlal           v26.4s, v10.4h, v14.4h[2]
-    smlal           v26.4s, v12.4h, v14.4h[1]
-    smlal           v26.4s, v16.4h, v14.4h[0]
-    smull           v24.4s, v7.4h,  v14.4h[3]
-    smlal           v24.4s, v11.4h, v14.4h[2]
-    smlal           v24.4s, v13.4h, v14.4h[1]
-    smlal           v24.4s, v17.4h, v14.4h[0]
+    smull           v26.4s, v6.4h,  v14.h[3]
+    smlal           v26.4s, v10.4h, v14.h[2]
+    smlal           v26.4s, v12.4h, v14.h[1]
+    smlal           v26.4s, v16.4h, v14.h[0]
+    smull           v24.4s, v7.4h,  v14.h[3]
+    smlal           v24.4s, v11.4h, v14.h[2]
+    smlal           v24.4s, v13.4h, v14.h[1]
+    smlal           v24.4s, v17.4h, v14.h[0]
     sshll           v15.4s, v4.4h,  #15
     sshll           v30.4s, v5.4h,  #15
     add             v20.4s, v15.4s, v26.4s
@@ -1445,12 +1446,12 @@ asm_function jsimd_idct_2x2_neon
     sub             v15.4s, v30.4s, v24.4s
     rshrn           v5.4h,  v20.4s, #13
     rshrn           v7.4h,  v15.4s, #13
-    ins             v4.2d[1], v5.2d[0]
-    ins             v6.2d[1], v7.2d[0]
+    ins             v4.d[1], v5.d[0]
+    ins             v6.d[1], v7.d[0]
     transpose       v4, v6, v3, .16b, .8h
     transpose       v6, v10, v3, .16b, .4s
-    ins             v11.2d[0], v10.2d[1]
-    ins             v7.2d[0], v6.2d[1]
+    ins             v11.d[0], v10.d[1]
+    ins             v7.d[0], v6.d[1]
 #endif
 
     /* Pass 2 */
@@ -1458,10 +1459,10 @@ asm_function jsimd_idct_2x2_neon
 
     /* Range limit */
     movi            v30.8h, #0x80
-    ins             v26.2d[1], v27.2d[0]
+    ins             v26.d[1], v27.d[0]
     add             v26.8h, v26.8h, v30.8h
     sqxtun          v30.8b, v26.8h
-    ins             v26.2d[0], v30.2d[0]
+    ins             v26.d[0], v30.d[0]
     sqxtun          v27.8b, v26.8h
 
     /* Store results to the output buffer */
@@ -1476,13 +1477,13 @@ asm_function jsimd_idct_2x2_neon
 
     sub             sp, sp, #208
     ldr             x15, [sp], 16
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v21.8b - v22.8b}, [sp], 16
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v30.8b - v31.8b}, [sp], 16
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v21.8b, v22.8b}, [sp], 16
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v30.8b, v31.8b}, [sp], 16
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1514,9 +1515,9 @@ asm_function jsimd_idct_2x2_neon
         ld1  {v4.8b}, [U], 8
         ld1  {v5.8b}, [V], 8
         ld1  {v0.8b}, [Y], 8
-        prfm PLDL1KEEP, [U, #64]
-        prfm PLDL1KEEP, [V, #64]
-        prfm PLDL1KEEP, [Y, #64]
+        prfm pldl1keep, [U, #64]
+        prfm pldl1keep, [V, #64]
+        prfm pldl1keep, [Y, #64]
     .elseif \size == 4
         ld1  {v4.b}[0], [U], 1
         ld1  {v4.b}[1], [U], 1
@@ -1606,14 +1607,14 @@ asm_function jsimd_idct_2x2_neon
 .macro do_yuv_to_rgb_stage1
     uaddw        v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
     uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
-    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
@@ -1656,18 +1657,18 @@ asm_function jsimd_idct_2x2_neon
     sqxtun       v1\g_offs\defsize, v20.8h
     ld1          {v0.8b}, [Y], 8
     sqxtun       v1\r_offs\defsize, v24.8h
-    prfm         PLDL1KEEP, [U, #64]
-    prfm         PLDL1KEEP, [V, #64]
-    prfm         PLDL1KEEP, [Y, #64]
+    prfm         pldl1keep, [U, #64]
+    prfm         pldl1keep, [V, #64]
+    prfm         pldl1keep, [Y, #64]
     sqxtun       v1\b_offs\defsize, v28.8h
     uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
     uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
+    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
 .else /**************************** rgb565 ***********************************/
     sqshlu       v21.8h, v20.8h, #8
     sqshlu       v25.8h, v24.8h, #8
@@ -1675,21 +1676,21 @@ asm_function jsimd_idct_2x2_neon
     uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
     uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
     ld1          {v0.8b}, [Y], 8
-    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
+    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
     sri          v25.8h, v21.8h, #5
-    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
-    prfm         PLDL1KEEP, [U, #64]
-    prfm         PLDL1KEEP, [V, #64]
-    prfm         PLDL1KEEP, [Y, #64]
+    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm         pldl1keep, [U, #64]
+    prfm         pldl1keep, [V, #64]
+    prfm         pldl1keep, [Y, #64]
     sri          v25.8h, v29.8h, #11
 .endif
     do_store     \bpp, 8
-    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb
@@ -1702,7 +1703,7 @@ asm_function jsimd_idct_2x2_neon
  */
 
 .balign 16
-jsimd_ycc_\colorid\()_neon_consts:
+Ljsimd_ycc_\colorid\()_neon_consts:
     .short          0,      0,     0,      0
     .short          22971, -11277, -23401, 29033
     .short          -128,  -128,   -128,   -128
@@ -1717,7 +1718,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 
     INPUT_BUF0      .req x5
     INPUT_BUF1      .req x6
-    INPUT_BUF2      .req INPUT_BUF
+    INPUT_BUF2      .req x1
 
     RGB             .req x7
     Y               .req x8
@@ -1728,16 +1729,16 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
     sub             sp, sp, 336
     str             x15, [sp], 16
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    adr             x15, jsimd_ycc_\colorid\()_neon_consts
+    adr             x15, Ljsimd_ycc_\colorid\()_neon_consts
     /* Save NEON registers */
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v20.8b - v23.8b}, [sp], 32
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v28.8b - v31.8b}, [sp], 32
+    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     ld1             {v0.4h, v1.4h}, [x15], 16
     ld1             {v2.8h}, [x15]
 
@@ -1748,8 +1749,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
     stp             x8, x9, [sp], 16
     stp             x10, x30, [sp], 16
     ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, 8]
-    ldr             INPUT_BUF2, [INPUT_BUF, 16]
+    ldr             INPUT_BUF1, [INPUT_BUF, #8]
+    ldr             INPUT_BUF2, [INPUT_BUF, #16]
     .unreq          INPUT_BUF
 
     /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
@@ -1758,7 +1759,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 
     /* Outer loop over scanlines */
     cmp             NUM_ROWS, #1
-    blt             9f
+    b.lt            9f
 0:
     lsl             x16, INPUT_ROW, #3
     ldr             Y, [INPUT_BUF0, x16]
@@ -1770,60 +1771,60 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 
     /* Inner loop over pixels */
     subs            N, N, #8
-    blt             3f
+    b.lt            3f
     do_load         8
     do_yuv_to_rgb_stage1
     subs            N, N, #8
-    blt             2f
+    b.lt            2f
 1:
     do_yuv_to_rgb_stage2_store_load_stage1
     subs            N, N, #8
-    bge             1b
+    b.ge            1b
 2:
     do_yuv_to_rgb_stage2
     do_store        \bpp, 8
     tst             N, #7
-    beq             8f
+    b.eq            8f
 3:
     tst             N, #4
-    beq             3f
+    b.eq            3f
     do_load         4
 3:
     tst             N, #2
-    beq             4f
+    b.eq            4f
     do_load         2
 4:
     tst             N, #1
-    beq             5f
+    b.eq            5f
     do_load         1
 5:
     do_yuv_to_rgb
     tst             N, #4
-    beq             6f
+    b.eq            6f
     do_store        \bpp, 4
 6:
     tst             N, #2
-    beq             7f
+    b.eq            7f
     do_store        \bpp, 2
 7:
     tst             N, #1
-    beq             8f
+    b.eq            8f
     do_store        \bpp, 1
 8:
     subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
+    b.gt            0b
 9:
     /* Restore all registers and return */
     sub             sp, sp, #336
     ldr             x15, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v20.8b - v23.8b}, [sp], 32
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v28.8b - v31.8b}, [sp], 32
+    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
     ldp             x4, x5, [sp], 16
     ldp             x6, x7, [sp], 16

From f8e80392047874d42e42bee17fdf348bd0d309c1 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 19 Dec 2014 18:25:03 +0000
Subject: [PATCH 056/108] In the output of the configure script, indicate
 whether gas-preprocessor.pl is being used along with the assembler.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1457 632fc199-4ca6-4c93-a231-07263d6284db
---
 configure.ac | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index 39a9c2ba..cd232841 100644
--- a/configure.ac
+++ b/configure.ac
@@ -437,7 +437,11 @@ if test "x${with_simd}" != "xno"; then
       AC_MSG_RESULT([yes (arm)])
       AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
       AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
-        [AC_MSG_RESULT([yes])
+        [if test "x$ac_use_gas_preprocessor" = "xyes"; then
+           AC_MSG_RESULT([yes (with gas-preprocessor)])
+         else
+           AC_MSG_RESULT([yes])
+         fi
          simd_arch=arm],
         [AC_MSG_RESULT([no])
          with_simd=no])
@@ -453,7 +457,11 @@ if test "x${with_simd}" != "xno"; then
       AC_MSG_RESULT([yes (arm64)])
       AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
       AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
-        [AC_MSG_RESULT([yes])
+        [if test "x$ac_use_gas_preprocessor" = "xyes"; then
+           AC_MSG_RESULT([yes (with gas-preprocessor)])
+         else
+           AC_MSG_RESULT([yes])
+         fi
          simd_arch=aarch64],
         [AC_MSG_RESULT([no])
          with_simd=no])

From 4efb529bb7b91790679dd6597689e9de7e7b94bc Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 19 Dec 2014 18:28:00 +0000
Subject: [PATCH 057/108] Include ARMv8 binaries when generating a combined OS
 X/iOS package using 'make iosdmg'

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1458 632fc199-4ca6-4c93-a231-07263d6284db
---
 BUILDING.txt          | 108 +++++++++++++++++--------
 ChangeLog.txt         |   8 +-
 Makefile.am           |   4 +-
 release/makemacpkg.in | 184 ++++++++++++++++++++++++++++++------------
 4 files changed, 216 insertions(+), 88 deletions(-)

diff --git a/BUILDING.txt b/BUILDING.txt
index a19d8ce6..c800b6e6 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -335,6 +335,9 @@ Additional build requirements:
   (https://sourceforge.net/p/libjpeg-turbo/code/HEAD/tree/gas-preprocessor)
   should be installed in your PATH.
 
+
+ARM 32-bit Build (Xcode 4.6.x and earlier, LLVM-GCC):
+
 Set the following shell variables for simplicity:
 
   Xcode 4.2 and earlier:
@@ -343,28 +346,18 @@ Set the following shell variables for simplicity:
   IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
 
   IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
-
-  Xcode 4.6.x and earlier:
   IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
-  Xcode 5.0.x and later:
-  IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
 
-  ARM v6 (code will run on all iOS devices, not SIMD-accelerated):
+  ARMv6 (code will run on all iOS devices, not SIMD-accelerated):
   [NOTE: Requires Xcode 4.4.x or earlier]
   IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
 
-  ARM v7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
-  Xcode 4.6.x and earlier:
+  ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
   IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
-  Xcode 5.0.x and later:
-  IOS_CFLAGS="-arch armv7"
 
-  ARM v7s (code will run on iPhone 5/iPad 4th Generation and newer):
+  ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
   [NOTE: Requires Xcode 4.5 or later]
-  Xcode 4.6.x and earlier:
   IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
-  Xcode 5.0.x and later:
-  IOS_CFLAGS="-arch armv7s"
 
 Follow the procedure under "Building libjpeg-turbo" above, adding
 
@@ -373,24 +366,67 @@ Follow the procedure under "Building libjpeg-turbo" above, adding
     CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
     LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
 
-to the configure command line.  If using Xcode 5.0.x or later, also add
+to the configure command line.
 
+
+ARM 32-bit Build (Xcode 5.0.x and later, Clang):
+
+Set the following shell variables for simplicity:
+
+  IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+  IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+  IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+
+  ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
+  IOS_CFLAGS="-arch armv7"
+
+  ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
+  IOS_CFLAGS="-arch armv7s"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+  --host arm-apple-darwin10 \
+    CC="$IOS_GCC" LD="$IOS_GCC" \
+    CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+    LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS" \
     CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
 
 to the configure command line.
 
+
+ARMv8 64-bit Build (Xcode 5.0.x and later, Clang):
+
+Code will run on iPhone 5S/iPad Mini 2 and newer.
+
+Set the following shell variables for simplicity:
+
+  IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+  IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+  IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  IOS_CFLAGS="-arch arm64"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+  --host aarch64-apple-darwin \
+    CC="$IOS_GCC" LD="$IOS_GCC" \
+    CFLAGS="-isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+    LDFLAGS="-isysroot $IOS_SYSROOT $IOS_CFLAGS"
+
+to the configure command line.
+
+
 NOTE:  You can also add -miphoneos-version-min={version} to $IOS_CFLAGS above
 in order to support older versions of iOS than the default version supported by
 the SDK.
 
-Once built, lipo can be used to combine the ARM v6, v7, and/or v7s variants
+Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
 into a universal library.
 
 NOTE: If you are building libjpeg-turbo from the "official" project tarball,
 then it is highly likely that you will need to run 'autoreconf -fiv' in the
-source tree prior to building ARM v7 or v7s iOS binaries using the techniques
-described above.  Otherwise, you may get a libtool error such as "unable to
-infer tagged configuration."
+source tree prior to building ARMv7, v7s, or v8 iOS binaries using the
+techniques described above.  Otherwise, you may get a libtool error such as
+"unable to infer tagged configuration."
 
 
 Building libjpeg-turbo for Android
@@ -764,26 +800,32 @@ make udmg [BUILDDIR32={32-bit build directory}]
   make command line as shown above.
 
 make iosdmg [BUILDDIR32={32-bit build directory}] \
-  [BUILDDIRARMV6={ARM v6 build directory}] \
-  [BUILDDIRARMV7={ARM v7 build directory}] \
-  [BUILDDIRARMV7S={ARM v7s build directory}]
+  [BUILDDIRARMV6={ARMv6 build directory}] \
+  [BUILDDIRARMV7={ARMv7 build directory}] \
+  [BUILDDIRARMV7S={ARMv7s build directory}] \
+  [BUILDDIRARMV8={ARMv8 build directory}]
 
   On OS X systems, this creates a Macintosh package and disk image in which the
   libjpeg-turbo static libraries contain ARM architectures necessary to build
   iOS applications.  If building on an x86-64 system, the binaries will also
   contain the i386 architecture, as with 'make udmg' above.  You should first
-  configure ARM v6, ARM v7, and/or ARM v7s out-of-tree builds of libjpeg-turbo
-  (see "Building libjpeg-turbo for iOS" above.)  If you are building an x86-64
-  version of libjpeg-turbo, you should configure a 32-bit out-of-tree build as
-  well.  Next, build libjpeg-turbo as you would normally, using an out-of-tree
-  build.  When it is built, run 'make iosdmg' from the build directory.  The
-  build system will look for the ARM v6 build under {source_directory}/iosarmv6
-  by default, the ARM v7 build under {source_directory}/iosarmv7 by default,
-  the ARM v7s build under {source_directory}/iosarmv7s by default, and (if
-  applicable) the 32-bit build under {source_directory}/osxx86 by default, but
-  you can override this by setting the BUILDDIR32, BUILDDIRARMV6,
-  BUILDDIRARMV7, and/or BUILDDIRARMV7S variables on the make command line as
-  shown above.
+  configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
+  libjpeg-turbo (see "Building libjpeg-turbo for iOS" above.)  If you are
+  building an x86-64 version of libjpeg-turbo, you should configure a 32-bit
+  out-of-tree build as well.  Next, build libjpeg-turbo as you would normally,
+  using an out-of-tree build.  When it is built, run 'make iosdmg' from the
+  build directory.  The build system will look for the ARMv6 build under
+  {source_directory}/iosarmv6 by default, the ARMv7 build under
+  {source_directory}/iosarmv7 by default, the ARMv7s build under
+  {source_directory}/iosarmv7s by default, the ARMv8 build under
+  {source_directory}/iosarmv8 by default, and (if applicable) the 32-bit build
+  under {source_directory}/osxx86 by default, but you can override this by
+  setting the BUILDDIR32, BUILDDIRARMV6, BUILDDIRARMV7, BUILDDIRARMV7S, and/or
+  BUILDDIRARMV8 variables on the make command line as shown above.
+
+  NOTE: If including an ARMv8 build in the package, then you may need to use
+  Xcode's version of lipo instead of the operating system's.  To do this, pass
+  an argument of LIPO="xcrun lipo" on the make command line.
 
 make cygwinpkg
 
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 40ae4814..0afab584 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -63,6 +63,10 @@ good argument can be made that this is a bad practice on the part of the
 software in question, but since this affects more than one package, it's just
 easier to fix it here.
 
+[12] Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
 
 1.3.90 (1.4 beta1)
 ==================
@@ -315,7 +319,7 @@ configure/CMake switch in order to retain strict API/ABI compatibility with the
 libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.)  See
 README-turbo.txt for more details.
 
-[13] Added ARM v7s architecture to libjpeg.a and libturbojpeg.a in the official
+[13] Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
 libjpeg-turbo binary package for OS X, so that those libraries can be used to
 build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
 
@@ -398,7 +402,7 @@ K component is assigned a component ID of 1 instead of 4.  Although these files
 are in violation of the spec, other JPEG implementations handle them
 correctly.
 
-[7] Added ARM v6 and ARM v7 architectures to libjpeg.a and libturbojpeg.a in
+[7] Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
 the official libjpeg-turbo binary package for OS X, so that those libraries can
 be used to build both OS X and iOS applications.
 
diff --git a/Makefile.am b/Makefile.am
index 569c0bbc..b32a41b2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -584,12 +584,12 @@ udmg: all pkgscripts/makemacpkg pkgscripts/uninstall
 	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
 
 iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
-	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S}
+	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
 
 else
 
 iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
-	sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S}
+	sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
 
 endif
 
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index d0edecb0..71637573 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -17,7 +17,7 @@ onexit()
 
 usage()
 {
-	echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARM v6 build dir]] [-buildarmv7 [ARM v7 build dir]] [-buildarmv7s [ARM v7s build dir]]"
+	echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARMv6 build dir]] [-buildarmv7 [ARMv7 build dir]] [-buildarmv7s [ARMv7s build dir]] [-buildarmv8 [ARMv8 build dir]] [-lipo [path to lipo]]"
 	exit 1
 }
 
@@ -33,7 +33,10 @@ BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
 BUILDARMV7=0
 BUILDDIRARMV7S=@abs_top_srcdir@/iosarmv7s
 BUILDARMV7S=0
+BUILDDIRARMV8=@abs_top_srcdir@/iosarmv8
+BUILDARMV8=0
 WITH_JAVA=@WITH_JAVA@
+LIPO=lipo
 
 PREFIX=%{__prefix}
 BINDIR=%{__bindir}
@@ -75,6 +78,21 @@ while [ $# -gt 0 ]; do
 			fi
 		fi
 		;;
+	-buildarmv8)
+		BUILDARMV8=1
+		if [ $# -gt 1 ]; then
+			if [[ ! "$2" =~ -.* ]]; then
+				BUILDDIRARMV8=$2;  shift
+			fi
+		fi
+		;;
+	-lipo)
+		if [ $# -gt 1 ]; then
+			if [[ ! "$2" =~ -.* ]]; then
+				LIPO=$2;  shift
+			fi
+		fi
+		;;
 	esac
 	shift
 done
@@ -110,50 +128,50 @@ if [ $BUILD32 = 1 ]; then
 	popd
 	if [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
 	elif [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
 	fi
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
 		-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.0.dylib \
 		-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
 		-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/cjpeg \
 		-arch x86_64 $PKGROOT/$BINDIR/cjpeg \
 		-output $PKGROOT/$BINDIR/cjpeg
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/djpeg \
 		-arch x86_64 $PKGROOT/$BINDIR/djpeg \
 		-output $PKGROOT/$BINDIR/djpeg
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/jpegtran \
 		-arch x86_64 $PKGROOT/$BINDIR/jpegtran \
 		-output $PKGROOT/$BINDIR/jpegtran
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/tjbench \
 		-arch x86_64 $PKGROOT/$BINDIR/tjbench \
 		-output $PKGROOT/$BINDIR/tjbench
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/rdjpgcom \
 		-arch x86_64 $PKGROOT/$BINDIR/rdjpgcom \
 		-output $PKGROOT/$BINDIR/rdjpgcom
-	lipo -create \
+	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
 		-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
 		-output $PKGROOT/$BINDIR/wrjpgcom
@@ -162,11 +180,11 @@ fi
 
 if [ $BUILDARMV6 = 1 ]; then
 	if [ ! -d $BUILDDIRARMV6 ]; then
-		echo ERROR: ARM v6 build directory $BUILDDIRARMV6 does not exist
+		echo ERROR: ARMv6 build directory $BUILDDIRARMV6 does not exist
 		exit 1
 	fi
 	if [ ! -f $BUILDDIRARMV6/Makefile ]; then
-		echo ERROR: ARM v6 build directory $BUILDDIRARMV6 is not configured
+		echo ERROR: ARMv6 build directory $BUILDDIRARMV6 is not configured
 		exit 1
 	fi
 	mkdir -p $TMPDIR/dist.armv6
@@ -175,50 +193,50 @@ if [ $BUILDARMV6 = 1 ]; then
 	popd
 	if [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
 	elif [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
 	fi
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
 		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
 		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.0.dylib \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
 		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/cjpeg \
 		-arch arm $TMPDIR/dist.armv6/$BINDIR/cjpeg \
 		-output $PKGROOT/$BINDIR/cjpeg
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/djpeg \
 		-arch arm $TMPDIR/dist.armv6/$BINDIR/djpeg \
 		-output $PKGROOT/$BINDIR/djpeg
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/jpegtran \
 		-arch arm $TMPDIR/dist.armv6/$BINDIR/jpegtran \
 		-output $PKGROOT/$BINDIR/jpegtran
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/tjbench \
 		-arch arm $TMPDIR/dist.armv6/$BINDIR/tjbench \
 		-output $PKGROOT/$BINDIR/tjbench
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/rdjpgcom \
 		-arch arm $TMPDIR/dist.armv6/$BINDIR/rdjpgcom \
 		-output $PKGROOT/$BINDIR/rdjpgcom
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/wrjpgcom \
 		-arch arm $TMPDIR/dist.armv6/$BINDIR/wrjpgcom \
 		-output $PKGROOT/$BINDIR/wrjpgcom
@@ -226,11 +244,11 @@ fi
 
 if [ $BUILDARMV7 = 1 ]; then
 	if [ ! -d $BUILDDIRARMV7 ]; then
-		echo ERROR: ARM v7 build directory $BUILDDIRARMV7 does not exist
+		echo ERROR: ARMv7 build directory $BUILDDIRARMV7 does not exist
 		exit 1
 	fi
 	if [ ! -f $BUILDDIRARMV7/Makefile ]; then
-		echo ERROR: ARM v7 build directory $BUILDDIRARMV7 is not configured
+		echo ERROR: ARMv7 build directory $BUILDDIRARMV7 is not configured
 		exit 1
 	fi
 	mkdir -p $TMPDIR/dist.armv7
@@ -239,50 +257,50 @@ if [ $BUILDARMV7 = 1 ]; then
 	popd
 	if [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
 	elif [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
 	fi
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
 		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
 		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.0.dylib \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
 		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/cjpeg \
 		-arch arm $TMPDIR/dist.armv7/$BINDIR/cjpeg \
 		-output $PKGROOT/$BINDIR/cjpeg
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/djpeg \
 		-arch arm $TMPDIR/dist.armv7/$BINDIR/djpeg \
 		-output $PKGROOT/$BINDIR/djpeg
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/jpegtran \
 		-arch arm $TMPDIR/dist.armv7/$BINDIR/jpegtran \
 		-output $PKGROOT/$BINDIR/jpegtran
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/tjbench \
 		-arch arm $TMPDIR/dist.armv7/$BINDIR/tjbench \
 		-output $PKGROOT/$BINDIR/tjbench
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/rdjpgcom \
 		-arch arm $TMPDIR/dist.armv7/$BINDIR/rdjpgcom \
 		-output $PKGROOT/$BINDIR/rdjpgcom
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/wrjpgcom \
 		-arch arm $TMPDIR/dist.armv7/$BINDIR/wrjpgcom \
 		-output $PKGROOT/$BINDIR/wrjpgcom
@@ -290,11 +308,11 @@ fi
 
 if [ $BUILDARMV7S = 1 ]; then
 	if [ ! -d $BUILDDIRARMV7S ]; then
-		echo ERROR: ARM v7s build directory $BUILDDIRARMV7S does not exist
+		echo ERROR: ARMv7s build directory $BUILDDIRARMV7S does not exist
 		exit 1
 	fi
 	if [ ! -f $BUILDDIRARMV7S/Makefile ]; then
-		echo ERROR: ARM v7s build directory $BUILDDIRARMV7S is not configured
+		echo ERROR: ARMv7s build directory $BUILDDIRARMV7S is not configured
 		exit 1
 	fi
 	mkdir -p $TMPDIR/dist.armv7s
@@ -303,55 +321,119 @@ if [ $BUILDARMV7S = 1 ]; then
 	popd
 	if [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
 	elif [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
 		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		lipo -create \
+		$LIPO -create \
 			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
 			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
 	fi
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
 		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
 		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.0.dylib \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
 		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/cjpeg \
 		-arch arm $TMPDIR/dist.armv7s/$BINDIR/cjpeg \
 		-output $PKGROOT/$BINDIR/cjpeg
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/djpeg \
 		-arch arm $TMPDIR/dist.armv7s/$BINDIR/djpeg \
 		-output $PKGROOT/$BINDIR/djpeg
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/jpegtran \
 		-arch arm $TMPDIR/dist.armv7s/$BINDIR/jpegtran \
 		-output $PKGROOT/$BINDIR/jpegtran
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/tjbench \
 		-arch arm $TMPDIR/dist.armv7s/$BINDIR/tjbench \
 		-output $PKGROOT/$BINDIR/tjbench
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/rdjpgcom \
 		-arch arm $TMPDIR/dist.armv7s/$BINDIR/rdjpgcom \
 		-output $PKGROOT/$BINDIR/rdjpgcom
-	lipo -create \
+	$LIPO -create \
 		$PKGROOT/$BINDIR/wrjpgcom \
 		-arch arm $TMPDIR/dist.armv7s/$BINDIR/wrjpgcom \
 		-output $PKGROOT/$BINDIR/wrjpgcom
 fi
 
+if [ $BUILDARMV8 = 1 ]; then
+	if [ ! -d $BUILDDIRARMV8 ]; then
+		echo ERROR: ARMv8 build directory $BUILDDIRARMV8 does not exist
+		exit 1
+	fi
+	if [ ! -f $BUILDDIRARMV8/Makefile ]; then
+		echo ERROR: ARMv8 build directory $BUILDDIRARMV8 is not configured
+		exit 1
+	fi
+	mkdir -p $TMPDIR/dist.armv8
+	pushd $BUILDDIRARMV8
+	make install DESTDIR=$TMPDIR/dist.armv8
+	popd
+	if [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
+		$LIPO -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
+	elif [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
+		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
+		$LIPO -create \
+			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
+			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
+	fi
+	$LIPO -create \
+		$PKGROOT/$LIBDIR/libjpeg.a \
+		-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.a \
+		-output $PKGROOT/$LIBDIR/libjpeg.a
+	$LIPO -create \
+		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
+		-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.0.dylib \
+		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
+	$LIPO -create \
+		$PKGROOT/$LIBDIR/libturbojpeg.a \
+		-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.a \
+		-output $PKGROOT/$LIBDIR/libturbojpeg.a
+	$LIPO -create \
+		$PKGROOT/$BINDIR/cjpeg \
+		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/cjpeg \
+		-output $PKGROOT/$BINDIR/cjpeg
+	$LIPO -create \
+		$PKGROOT/$BINDIR/djpeg \
+		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/djpeg \
+		-output $PKGROOT/$BINDIR/djpeg
+	$LIPO -create \
+		$PKGROOT/$BINDIR/jpegtran \
+		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/jpegtran \
+		-output $PKGROOT/$BINDIR/jpegtran
+	$LIPO -create \
+		$PKGROOT/$BINDIR/tjbench \
+		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/tjbench \
+		-output $PKGROOT/$BINDIR/tjbench
+	$LIPO -create \
+		$PKGROOT/$BINDIR/rdjpgcom \
+		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/rdjpgcom \
+		-output $PKGROOT/$BINDIR/rdjpgcom
+	$LIPO -create \
+		$PKGROOT/$BINDIR/wrjpgcom \
+		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/wrjpgcom \
+		-output $PKGROOT/$BINDIR/wrjpgcom
+fi
+
 install_name_tool -id $LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
 install_name_tool -id $LIBDIR/libturbojpeg.0.dylib $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
 

From 4ac7101056222c31a5d50728f352f875d70bd8c3 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sat, 20 Dec 2014 00:48:03 +0100
Subject: [PATCH 058/108] No scan opt if custom scan

Disables scan pattern optimization (JBOOLEAN_OPTIMIZE_SCANS extension
parameter) if a custom scan pattern is provided.
Fixes #137
---
 rdswitch.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rdswitch.c b/rdswitch.c
index d4c630e6..d6fba749 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -265,6 +265,9 @@ bogus:
     MEMCOPY(scanptr, scans, scanno * sizeof(jpeg_scan_info));
     cinfo->scan_info = scanptr;
     cinfo->num_scans = scanno;
+    
+    /* Disable scan optimization if using custom scan */
+    jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
   }
 
   fclose(fp);

From 864600d70705bfe9af852689d43c05f23b8230b5 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 20 Dec 2014 01:14:38 +0000
Subject: [PATCH 059/108] Swap the order of the IFAST and ISLOW FDCT functions
 so that it matches the order of the prototypes in jsimd.h and the stubs in
 jsimd_powerpc.c.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1459 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_powerpc_altivec.c | 286 +++++++++++++++++------------------
 1 file changed, 143 insertions(+), 143 deletions(-)

diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index b77208ca..798757cf 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -71,149 +71,6 @@
 }
 
 
-/* FAST INTEGER FORWARD DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- *   the elements in arg3 + the most significant 17 bits of
- *     (the elements in arg1 * the elements in arg2).
- */
-
-#define IFAST_F_0_382 98   /* FIX(0.382683433) */
-#define IFAST_F_0_541 139  /* FIX(0.541196100) */
-#define IFAST_F_0_707 181  /* FIX(0.707106781) */
-#define IFAST_F_1_306 334  /* FIX(1.306562965) */
-
-#define IFAST_CONST_BITS 8
-#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
-#define IFAST_CONST_SHIFT \
-  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
-
-static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
-{
-  IFAST_F_0_382 << IFAST_CONST_SHIFT,
-  IFAST_F_0_541 << IFAST_CONST_SHIFT,
-  IFAST_F_0_707 << IFAST_CONST_SHIFT,
-  IFAST_F_1_306 << IFAST_CONST_SHIFT
-};
-
-#define DO_FDCT_IFAST()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  \
-  z1 = vec_add(tmp12, tmp13);  \
-  z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS);  \
-  z1 = vec_madds(z1, PW_0707, zero);  \
-  \
-  out2 = vec_add(tmp13, z1);  \
-  out6 = vec_sub(tmp13, z1);  \
-  \
-  /* Odd part */  \
-  \
-  tmp10 = vec_add(tmp4, tmp5);  \
-  tmp11 = vec_add(tmp5, tmp6);  \
-  tmp12 = vec_add(tmp6, tmp7);  \
-  \
-  tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS);  \
-  tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS);  \
-  z5 = vec_sub(tmp10, tmp12);  \
-  z5 = vec_madds(z5, PW_0382, zero);  \
-  \
-  z2 = vec_madds(tmp10, PW_0541, zero);  \
-  z2 = vec_add(z2, z5);  \
-  \
-  z4 = vec_madds(tmp12, PW_1306, zero);  \
-  z4 = vec_add(z4, z5);  \
-  \
-  tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
-  z3 = vec_madds(tmp11, PW_0707, zero);  \
-  \
-  z11 = vec_add(tmp7, z3);  \
-  z13 = vec_sub(tmp7, z3);  \
-  \
-  out5 = vec_add(z13, z2);  \
-  out3 = vec_sub(z13, z2);  \
-  out1 = vec_add(z11, z4);  \
-  out7 = vec_sub(z11, z4);  \
-}
-
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
-{
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
-    z1, z2, z3, z4, z5, z11, z13,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-
-  /* Constants */
-  __vector short zero = vec_splat_s16(0),
-    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
-    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
-    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
-    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
-  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
-    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
-
-  /* Pass 1: process rows. */
-
-  row0 = *(__vector short *)&data[0];
-  row1 = *(__vector short *)&data[8];
-  row2 = *(__vector short *)&data[16];
-  row3 = *(__vector short *)&data[24];
-  row4 = *(__vector short *)&data[32];
-  row5 = *(__vector short *)&data[40];
-  row6 = *(__vector short *)&data[48];
-  row7 = *(__vector short *)&data[56];
-
-  TRANSPOSE(row, col);
-
-  tmp0 = vec_add(col0, col7);
-  tmp7 = vec_sub(col0, col7);
-  tmp1 = vec_add(col1, col6);
-  tmp6 = vec_sub(col1, col6);
-  tmp2 = vec_add(col2, col5);
-  tmp5 = vec_sub(col2, col5);
-  tmp3 = vec_add(col3, col4);
-  tmp4 = vec_sub(col3, col4);
-
-  DO_FDCT_IFAST();
-
-  /* Pass 2: process columns. */
-
-  TRANSPOSE(out, row);
-
-  tmp0 = vec_add(row0, row7);
-  tmp7 = vec_sub(row0, row7);
-  tmp1 = vec_add(row1, row6);
-  tmp6 = vec_sub(row1, row6);
-  tmp2 = vec_add(row2, row5);
-  tmp5 = vec_sub(row2, row5);
-  tmp3 = vec_add(row3, row4);
-  tmp4 = vec_sub(row3, row4);
-
-  DO_FDCT_IFAST();
-
-  *(__vector short *)&data[0] = out0;
-  *(__vector short *)&data[8] = out1;
-  *(__vector short *)&data[16] = out2;
-  *(__vector short *)&data[24] = out3;
-  *(__vector short *)&data[32] = out4;
-  *(__vector short *)&data[40] = out5;
-  *(__vector short *)&data[48] = out6;
-  *(__vector short *)&data[56] = out7;
-}
-
-
 /* SLOW INTEGER FORWARD DCT */
 
 #define ISLOW_F_0_298 2446   /* FIX(0.298631336) */
@@ -480,6 +337,149 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
 }
 
 
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#define IFAST_F_0_382 98   /* FIX(0.382683433) */
+#define IFAST_F_0_541 139  /* FIX(0.541196100) */
+#define IFAST_F_0_707 181  /* FIX(0.707106781) */
+#define IFAST_F_1_306 334  /* FIX(1.306562965) */
+
+#define IFAST_CONST_BITS 8
+#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
+#define IFAST_CONST_SHIFT \
+  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
+
+static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
+{
+  IFAST_F_0_382 << IFAST_CONST_SHIFT,
+  IFAST_F_0_541 << IFAST_CONST_SHIFT,
+  IFAST_F_0_707 << IFAST_CONST_SHIFT,
+  IFAST_F_1_306 << IFAST_CONST_SHIFT
+};
+
+#define DO_FDCT_IFAST()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  \
+  z1 = vec_add(tmp12, tmp13);  \
+  z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS);  \
+  z1 = vec_madds(z1, PW_0707, zero);  \
+  \
+  out2 = vec_add(tmp13, z1);  \
+  out6 = vec_sub(tmp13, z1);  \
+  \
+  /* Odd part */  \
+  \
+  tmp10 = vec_add(tmp4, tmp5);  \
+  tmp11 = vec_add(tmp5, tmp6);  \
+  tmp12 = vec_add(tmp6, tmp7);  \
+  \
+  tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS);  \
+  tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS);  \
+  z5 = vec_sub(tmp10, tmp12);  \
+  z5 = vec_madds(z5, PW_0382, zero);  \
+  \
+  z2 = vec_madds(tmp10, PW_0541, zero);  \
+  z2 = vec_add(z2, z5);  \
+  \
+  z4 = vec_madds(tmp12, PW_1306, zero);  \
+  z4 = vec_add(z4, z5);  \
+  \
+  tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
+  z3 = vec_madds(tmp11, PW_0707, zero);  \
+  \
+  z11 = vec_add(tmp7, z3);  \
+  z13 = vec_sub(tmp7, z3);  \
+  \
+  out5 = vec_add(z13, z2);  \
+  out3 = vec_sub(z13, z2);  \
+  out1 = vec_add(z11, z4);  \
+  out7 = vec_sub(z11, z4);  \
+}
+
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short zero = vec_splat_s16(0),
+    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
+    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
+    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
+    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
+  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
+    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
+
+  /* Pass 1: process rows. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_IFAST();
+
+  /* Pass 2: process columns. */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_IFAST();
+
+  *(__vector short *)&data[0] = out0;
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
+
+
 /* FAST INTEGER INVERSE DCT
  *
  * This is similar to the SSE2 implementation, except that we left-shift the

From 63c1674ebceaa6c064a109aa7cae571303b3e2b1 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 20 Dec 2014 01:16:26 +0000
Subject: [PATCH 060/108] Use macros to allocate constants statically, rather
 than reading them from a table using vec_splat*().  This improves code
 readability and probably improves performance a bit as well.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1460 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_powerpc_altivec.c | 135 +++++++++++------------------------
 1 file changed, 40 insertions(+), 95 deletions(-)

diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index 798757cf..fd3fcc2d 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -32,6 +32,11 @@
 
 /* Common code */
 
+#define __4X(a) a, a, a, a
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define __8X(a) __4X(a), __4X(a)
+#define __16X(a) __8X(a), __8X(a)
+
 #define TRANSPOSE(row, col)  \
 {  \
   __vector short row04l, row04h, row15l, row15h,  \
@@ -91,17 +96,6 @@
 #define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
 #define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
 
-static const __vector int jconst_fdct_islow __attribute__((aligned(16))) =
-{
-  1 << (ISLOW_DESCALE_P1 - 1),
-  1 << (ISLOW_DESCALE_P2 - 1)
-};
-
-static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
-{
-  1 << (ISLOW_PASS1_BITS - 1)
-};
-
 #define DO_FDCT_ISLOW_COMMON(PASS)  \
 {  \
   tmp1312l = vec_mergeh(tmp13, tmp12);  \
@@ -241,51 +235,24 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
     out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
     out7l, out7h;
 
-  __vector short PW_F130_F054 = {ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541,
-    ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541,
-    ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541,
-    ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541};
-  __vector short PW_F054_MF130 = {ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847,
-    ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847,
-    ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847,
-    ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847};
-  __vector short PW_MF078_F117 = {ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175,
-    ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175,
-    ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175,
-    ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175};
-  __vector short PW_F117_F078 = {ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390,
-    ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390,
-    ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390,
-    ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390};
-  __vector short PW_MF060_MF089 = {
-    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899,
-    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899,
-    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899,
-    ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899};
-  __vector short PW_MF089_F060 = {
-    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899,
-    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899,
-    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899,
-    -ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899};
-  __vector short PW_MF050_MF256 = {
-    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562,
-    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562,
-    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562,
-    ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562};
-  __vector short PW_MF256_F050 = {
-    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562,
-    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562,
-    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562,
-    -ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562};
-  __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
+  __vector short
+    PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
+    PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
+    PW_MF078_F117 = { __4X2(ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175) },
+    PW_F117_F078 = { __4X2(ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390) },
+    PW_MF060_MF089 = { __4X2(ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899) },
+    PW_MF089_F060 = { __4X2(-ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899) },
+    PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
+    PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) },
+    PW_DESCALE_P2X = { __8X(1 << (ISLOW_PASS1_BITS - 1)) };
 
   /* Constants */
-  __vector unsigned short PASS1_BITS = vec_splat_u16(ISLOW_PASS1_BITS);
-  __vector int zero = vec_splat_s32(0),
-    PD_DESCALE_P1 = vec_splat(jconst_fdct_islow, 0),
-    PD_DESCALE_P2 = vec_splat(jconst_fdct_islow, 1);
-  __vector unsigned int DESCALE_P1 = vec_splat_u32(ISLOW_DESCALE_P1),
-    DESCALE_P2 = vec_splat_u32(ISLOW_DESCALE_P2);
+  __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
+  __vector int zero = { __4X(0) },
+    PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
+    PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 - 1)) };
+  __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
+    DESCALE_P2 = { __4X(ISLOW_DESCALE_P2) };
 
   /* Pass 1: process rows. */
 
@@ -356,14 +323,6 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
 #define IFAST_CONST_SHIFT \
   (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
 
-static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
-{
-  IFAST_F_0_382 << IFAST_CONST_SHIFT,
-  IFAST_F_0_541 << IFAST_CONST_SHIFT,
-  IFAST_F_0_707 << IFAST_CONST_SHIFT,
-  IFAST_F_1_306 << IFAST_CONST_SHIFT
-};
-
 #define DO_FDCT_IFAST()  \
 {  \
   /* Even part */  \
@@ -423,12 +382,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
   /* Constants */
   __vector short zero = vec_splat_s16(0),
-    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
-    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
-    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
-    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
-  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
-    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
+    PW_0382 = { __8X(IFAST_F_0_382 << IFAST_CONST_SHIFT) },
+    PW_0541 = { __8X(IFAST_F_0_541 << IFAST_CONST_SHIFT) },
+    PW_0707 = { __8X(IFAST_F_0_707 << IFAST_CONST_SHIFT) },
+    PW_1306 = { __8X(IFAST_F_1_306 << IFAST_CONST_SHIFT) };
+  __vector unsigned short
+    PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) };
 
   /* Pass 1: process rows. */
 
@@ -489,29 +448,15 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
  *     (the elements in arg1 * the elements in arg2).
  */
 
-#define F_1_082 277              /* FIX(1.082392200) */
-#define F_1_414 362              /* FIX(1.414213562) */
-#define F_1_847 473              /* FIX(1.847759065) */
-#define F_2_613 669              /* FIX(2.613125930) */
-#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+#define IFAST_F_1_082 277                    /* FIX(1.082392200) */
+#define IFAST_F_1_414 362                    /* FIX(1.414213562) */
+#define IFAST_F_1_847 473                    /* FIX(1.847759065) */
+#define IFAST_F_2_613 669                    /* FIX(2.613125930) */
+#define IFAST_F_1_613 (IFAST_F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
 
 #define IFAST_PASS1_BITS 2
 #define IFAST_CENTERJSAMPLE 128
 
-static const __vector short jconst_idct_ifast __attribute__((aligned(16))) =
-{
-  F_1_414 << IFAST_CONST_SHIFT,
-  F_1_847 << IFAST_CONST_SHIFT,
-  -F_1_613 << IFAST_CONST_SHIFT,
-  F_1_082 << IFAST_CONST_SHIFT
-};
-
-static const __vector signed char jconst_idct_ifast2
-  __attribute__((aligned(16))) =
-{
-  IFAST_CENTERJSAMPLE
-};
-
 #define DO_IDCT_IFAST(in)  \
 {  \
   /* Even part */  \
@@ -583,15 +528,15 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
   long long *outptr, *outbptr = (long long *)(&outb);
 
   /* Constants */
-  __vector short zero = vec_splat_s16(0),
-    PW_F1414 = vec_splat(jconst_idct_ifast, 0),
-    PW_F1847 = vec_splat(jconst_idct_ifast, 1),
-    PW_MF1613 = vec_splat(jconst_idct_ifast, 2),
-    PW_F1082 = vec_splat(jconst_idct_ifast, 3);
+  __vector short zero = { __8X(0) },
+    PW_F1414 = { __8X(IFAST_F_1_414 << IFAST_CONST_SHIFT) },
+    PW_F1847 = { __8X(IFAST_F_1_847 << IFAST_CONST_SHIFT) },
+    PW_MF1613 = { __8X(-IFAST_F_1_613 << IFAST_CONST_SHIFT) },
+    PW_F1082 = { __8X(IFAST_F_1_082 << IFAST_CONST_SHIFT) };
   __vector unsigned short
-    PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS),
-    PASS1_BITS3 = vec_splat_u16(IFAST_PASS1_BITS + 3);
-  __vector signed char PB_CENTERJSAMP = vec_splat(jconst_idct_ifast2, 0);
+    PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) },
+    PASS1_BITS3 = { __8X(IFAST_PASS1_BITS + 3) };
+  __vector signed char PB_CENTERJSAMP = { __16X(IFAST_CENTERJSAMPLE) };
 
   /* Pass 1: process columns. */
 

From 0d435698f42587d57afe8e21ead8a9ef16f23350 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 20 Dec 2014 01:17:39 +0000
Subject: [PATCH 061/108] AltiVec SIMD implementation of slow integer inverse
 DCT

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1461 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd.h                 |   4 +
 simd/jsimd_powerpc.c         |  13 ++
 simd/jsimd_powerpc_altivec.c | 278 ++++++++++++++++++++++++++++++++++-
 3 files changed, 291 insertions(+), 4 deletions(-)

diff --git a/simd/jsimd.h b/simd/jsimd.h
index f3396110..9b50610a 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -646,6 +646,10 @@ EXTERN(void) jsimd_idct_islow_mips_dspr2
         (void * dct_table, JCOEFPTR coef_block, int * output_buf,
          JSAMPLE * output_col);
 
+EXTERN(void) jsimd_idct_islow_altivec
+        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
 /* Fast Integer Inverse DCT */
 EXTERN(void) jsimd_idct_ifast_mmx
         (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index eb977d73..f92851c5 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -333,6 +333,17 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 GLOBAL(int)
 jsimd_can_idct_islow (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -364,6 +375,8 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
 }
 
 GLOBAL(void)
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index fd3fcc2d..0d251dfa 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -235,6 +235,7 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
     out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
     out7l, out7h;
 
+  /* Constants */
   __vector short
     PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
     PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
@@ -245,8 +246,6 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
     PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
     PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) },
     PW_DESCALE_P2X = { __8X(1 << (ISLOW_PASS1_BITS - 1)) };
-
-  /* Constants */
   __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
   __vector int zero = { __4X(0) },
     PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
@@ -439,6 +438,278 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 }
 
 
+/* SLOW INTEGER INVERSE DCT */
+
+#define DO_IDCT_ISLOW(in, PASS)  \
+{  \
+  /* Even part */  \
+  \
+  in##26l = vec_mergeh(in##2, in##6);  \
+  in##26h = vec_mergel(in##2, in##6);  \
+  \
+  tmp3l = vec_msums(in##26l, PW_F130_F054, zero32);  \
+  tmp3h = vec_msums(in##26h, PW_F130_F054, zero32);  \
+  tmp2l = vec_msums(in##26l, PW_F054_MF130, zero32);  \
+  tmp2h = vec_msums(in##26h, PW_F054_MF130, zero32);  \
+  \
+  tmp0 = vec_add(in##0, in##4);  \
+  tmp1 = vec_sub(in##0, in##4);  \
+  \
+  tmp0l = vec_unpackh(tmp0);  \
+  tmp0h = vec_unpackl(tmp0);  \
+  tmp0l = vec_sl(tmp0l, CONST_BITS);  \
+  tmp0h = vec_sl(tmp0h, CONST_BITS);  \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l);  \
+  tmp10h = vec_add(tmp0h, tmp3h);  \
+  tmp13l = vec_sub(tmp0l, tmp3l);  \
+  tmp13h = vec_sub(tmp0h, tmp3h);  \
+  \
+  tmp1l = vec_unpackh(tmp1);  \
+  tmp1h = vec_unpackl(tmp1);  \
+  tmp1l = vec_sl(tmp1l, CONST_BITS);  \
+  tmp1h = vec_sl(tmp1h, CONST_BITS);  \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l);  \
+  tmp11h = vec_add(tmp1h, tmp2h);  \
+  tmp12l = vec_sub(tmp1l, tmp2l);  \
+  tmp12h = vec_sub(tmp1h, tmp2h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(in##3, in##7);  \
+  z4 = vec_add(in##1, in##5);  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, PW_MF078_F117, zero32);  \
+  z3h = vec_msums(z34h, PW_MF078_F117, zero32);  \
+  z4l = vec_msums(z34l, PW_F117_F078, zero32);  \
+  z4h = vec_msums(z34h, PW_F117_F078, zero32);  \
+  \
+  in##71l = vec_mergeh(in##7, in##1);  \
+  in##71h = vec_mergel(in##7, in##1);  \
+  \
+  tmp0l = vec_msums(in##71l, PW_MF060_MF089, z3l);  \
+  tmp0h = vec_msums(in##71h, PW_MF060_MF089, z3h);  \
+  tmp3l = vec_msums(in##71l, PW_MF089_F060, z4l);  \
+  tmp3h = vec_msums(in##71h, PW_MF089_F060, z4h);  \
+  \
+  in##53l = vec_mergeh(in##5, in##3);  \
+  in##53h = vec_mergel(in##5, in##3);  \
+  \
+  tmp1l = vec_msums(in##53l, PW_MF050_MF256, z4l);  \
+  tmp1h = vec_msums(in##53h, PW_MF050_MF256, z4h);  \
+  tmp2l = vec_msums(in##53l, PW_MF256_F050, z3l);  \
+  tmp2h = vec_msums(in##53h, PW_MF256_F050, z3h);  \
+  \
+  /* Final output stage */  \
+  \
+  out0l = vec_add(tmp10l, tmp3l);  \
+  out0h = vec_add(tmp10h, tmp3h);  \
+  out7l = vec_sub(tmp10l, tmp3l);  \
+  out7h = vec_sub(tmp10h, tmp3h);  \
+  \
+  out0l = vec_add(out0l, PD_DESCALE_P##PASS);  \
+  out0h = vec_add(out0h, PD_DESCALE_P##PASS);  \
+  out0l = vec_sra(out0l, DESCALE_P##PASS);  \
+  out0h = vec_sra(out0h, DESCALE_P##PASS);  \
+  \
+  out7l = vec_add(out7l, PD_DESCALE_P##PASS);  \
+  out7h = vec_add(out7h, PD_DESCALE_P##PASS);  \
+  out7l = vec_sra(out7l, DESCALE_P##PASS);  \
+  out7h = vec_sra(out7h, DESCALE_P##PASS);  \
+  \
+  out0 = vec_pack(out0l, out0h);  \
+  out7 = vec_pack(out7l, out7h);  \
+  \
+  out1l = vec_add(tmp11l, tmp2l);  \
+  out1h = vec_add(tmp11h, tmp2h);  \
+  out6l = vec_sub(tmp11l, tmp2l);  \
+  out6h = vec_sub(tmp11h, tmp2h);  \
+  \
+  out1l = vec_add(out1l, PD_DESCALE_P##PASS);  \
+  out1h = vec_add(out1h, PD_DESCALE_P##PASS);  \
+  out1l = vec_sra(out1l, DESCALE_P##PASS);  \
+  out1h = vec_sra(out1h, DESCALE_P##PASS);  \
+  \
+  out6l = vec_add(out6l, PD_DESCALE_P##PASS);  \
+  out6h = vec_add(out6h, PD_DESCALE_P##PASS);  \
+  out6l = vec_sra(out6l, DESCALE_P##PASS);  \
+  out6h = vec_sra(out6h, DESCALE_P##PASS);  \
+  \
+  out1 = vec_pack(out1l, out1h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  out2l = vec_add(tmp12l, tmp1l);  \
+  out2h = vec_add(tmp12h, tmp1h);  \
+  out5l = vec_sub(tmp12l, tmp1l);  \
+  out5h = vec_sub(tmp12h, tmp1h);  \
+  \
+  out2l = vec_add(out2l, PD_DESCALE_P##PASS);  \
+  out2h = vec_add(out2h, PD_DESCALE_P##PASS);  \
+  out2l = vec_sra(out2l, DESCALE_P##PASS);  \
+  out2h = vec_sra(out2h, DESCALE_P##PASS);  \
+  \
+  out5l = vec_add(out5l, PD_DESCALE_P##PASS);  \
+  out5h = vec_add(out5h, PD_DESCALE_P##PASS);  \
+  out5l = vec_sra(out5l, DESCALE_P##PASS);  \
+  out5h = vec_sra(out5h, DESCALE_P##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out5 = vec_pack(out5l, out5h);  \
+  \
+  out3l = vec_add(tmp13l, tmp0l);  \
+  out3h = vec_add(tmp13h, tmp0h);  \
+  out4l = vec_sub(tmp13l, tmp0l);  \
+  out4h = vec_sub(tmp13h, tmp0h);  \
+  \
+  out3l = vec_add(out3l, PD_DESCALE_P##PASS);  \
+  out3h = vec_add(out3h, PD_DESCALE_P##PASS);  \
+  out3l = vec_sra(out3l, DESCALE_P##PASS);  \
+  out3h = vec_sra(out3h, DESCALE_P##PASS);  \
+  \
+  out4l = vec_add(out4l, PD_DESCALE_P##PASS);  \
+  out4h = vec_add(out4h, PD_DESCALE_P##PASS);  \
+  out4l = vec_sra(out4l, DESCALE_P##PASS);  \
+  out4h = vec_sra(out4h, DESCALE_P##PASS);  \
+  \
+  out3 = vec_pack(out3l, out3h);  \
+  out4 = vec_pack(out4l, out4h);  \
+}
+
+void
+jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+  long long *outptr, *outbptr = (long long *)(&outb);
+
+  /* Constants */
+  __vector short zero16 = { __8X(0) },
+    PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
+    PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
+    PW_MF078_F117 = { __4X2(ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175) },
+    PW_F117_F078 = { __4X2(ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390) },
+    PW_MF060_MF089 = { __4X2(ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899) },
+    PW_MF089_F060 = { __4X2(-ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899) },
+    PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
+    PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) };
+  __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
+  __vector int zero32 = { __4X(0) },
+    PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
+    PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 + 3 - 1)) };
+  __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
+    DESCALE_P2 = { __4X(ISLOW_DESCALE_P2 + 3) },
+    CONST_BITS = { __4X(ISLOW_CONST_BITS) };
+  __vector signed char PB_CENTERJSAMP = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns. */
+
+  col0 = *(__vector short *)&coef_block[0];
+  col1 = *(__vector short *)&coef_block[8];
+  col2 = *(__vector short *)&coef_block[16];
+  col3 = *(__vector short *)&coef_block[24];
+  col4 = *(__vector short *)&coef_block[32];
+  col5 = *(__vector short *)&coef_block[40];
+  col6 = *(__vector short *)&coef_block[48];
+  col7 = *(__vector short *)&coef_block[56];
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = *(__vector short *)&dct_table[0];
+  col0 = vec_mladd(col0, quant0, zero16);
+
+  if (vec_all_eq(tmp1, zero16)) {
+    /* AC terms all zero */
+
+    col0 = vec_sl(col0, PASS1_BITS);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = *(__vector short *)&dct_table[8];
+    quant2 = *(__vector short *)&dct_table[16];
+    quant3 = *(__vector short *)&dct_table[24];
+    quant4 = *(__vector short *)&dct_table[32];
+    quant5 = *(__vector short *)&dct_table[40];
+    quant6 = *(__vector short *)&dct_table[48];
+    quant7 = *(__vector short *)&dct_table[56];
+
+    col1 = vec_mladd(col1, quant1, zero16);
+    col2 = vec_mladd(col2, quant2, zero16);
+    col3 = vec_mladd(col3, quant3, zero16);
+    col4 = vec_mladd(col4, quant4, zero16);
+    col5 = vec_mladd(col5, quant5, zero16);
+    col6 = vec_mladd(col6, quant6, zero16);
+    col7 = vec_mladd(col7, quant7, zero16);
+
+    DO_IDCT_ISLOW(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+
+  DO_IDCT_ISLOW(row, 2);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col1);
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[0] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[1] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col2, col3);
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[2] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[3] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col4, col5);
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[4] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[5] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col6, col7);
+  outb = vec_add(outb, PB_CENTERJSAMP);
+  outptr = (long long *)(output_buf[6] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[7] + output_col);
+  *outptr = outbptr[1];
+}
+
+
 /* FAST INTEGER INVERSE DCT
  *
  * This is similar to the SSE2 implementation, except that we left-shift the
@@ -455,7 +726,6 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 #define IFAST_F_1_613 (IFAST_F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
 
 #define IFAST_PASS1_BITS 2
-#define IFAST_CENTERJSAMPLE 128
 
 #define DO_IDCT_IFAST(in)  \
 {  \
@@ -536,7 +806,7 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
   __vector unsigned short
     PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) },
     PASS1_BITS3 = { __8X(IFAST_PASS1_BITS + 3) };
-  __vector signed char PB_CENTERJSAMP = { __16X(IFAST_CENTERJSAMPLE) };
+  __vector signed char PB_CENTERJSAMP = { __16X(CENTERJSAMPLE) };
 
   /* Pass 1: process columns. */
 

From 6c08ceb2eff74549494c3cffab457f5b5bd95d92 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sat, 20 Dec 2014 04:03:14 +0100
Subject: [PATCH 062/108] Temp fix for #138

Disable scan optimization and trellis quantization when arithmetic
coding is used.
---
 cjpeg.c    | 9 +++++++++
 jpegtran.c | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/cjpeg.c b/cjpeg.c
index c6ffb09f..40c24a75 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -289,6 +289,15 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       /* Use arithmetic coding. */
 #ifdef C_ARITH_CODING_SUPPORTED
       cinfo->arith_code = TRUE;
+      
+      /* No table optimization required for AC */
+      cinfo->optimize_coding = FALSE;
+      
+      /* Scan optimization currently incompatible with AC */
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
+      
+      /* Trellis quantization currently incompatible with AC */
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT, FALSE);
 #else
       fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
               progname);
diff --git a/jpegtran.c b/jpegtran.c
index e5161c4b..90900d0f 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -175,6 +175,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       /* Use arithmetic coding. */
 #ifdef C_ARITH_CODING_SUPPORTED
       cinfo->arith_code = TRUE;
+
+      /* No table optimization required for AC */
+      cinfo->optimize_coding = FALSE;
+      
+      /* Scan optimization currently incompatible with AC */
+      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
 #else
       fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
               progname);

From 8a178239bd5ebe92971a6de6d3bb1ae3e0a88679 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sat, 20 Dec 2014 04:23:39 +0100
Subject: [PATCH 063/108] Partial fix for #138

Fix pass number computation in scan optimization to support the case where
Huffman table optimization is not done, e.g. when arithmetic coding is
used.
Enable the combination of arithmetic coding and scan optimization
(previously disabled).
---
 cjpeg.c    |  3 ---
 jcmaster.c | 11 ++++++-----
 jpegtran.c |  3 ---
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index 40c24a75..74096782 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -293,9 +293,6 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       /* No table optimization required for AC */
       cinfo->optimize_coding = FALSE;
       
-      /* Scan optimization currently incompatible with AC */
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
-      
       /* Trellis quantization currently incompatible with AC */
       jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT, FALSE);
 #else
diff --git a/jcmaster.c b/jcmaster.c
index c02f5903..62cfe07e 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -624,7 +624,8 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
   int chroma_freq_split_scan_start = cinfo->master->num_scans_luma +
                                      cinfo->master->num_scans_chroma_dc +
                                      (6 * cinfo->master->Al_max_chroma + 4);
-
+  int passes_per_scan = cinfo->optimize_coding ? 2 : 1;
+  
   if (next_scan_number > 1 && next_scan_number <= luma_freq_split_scan_start) {
     if ((next_scan_number - 1) % 3 == 2) {
       int Al = (next_scan_number - 1) / 3;
@@ -640,7 +641,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
         master->best_Al_luma = Al;
       } else {
         master->scan_number = luma_freq_split_scan_start - 1;
-        master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
+        master->pass_number = passes_per_scan * (master->scan_number + 1) - 1 + master->pass_number_scan_opt_base;
       }
     }
   
@@ -666,7 +667,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
           (idx == 3 && master->best_freq_split_idx_luma != 2) ||
           (idx == 4 && master->best_freq_split_idx_luma != 4)) {
         master->scan_number = cinfo->master->num_scans_luma - 1;
-        master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
+        master->pass_number = passes_per_scan * (master->scan_number + 1) - 1 + master->pass_number_scan_opt_base;
         master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
       }
     }
@@ -702,7 +703,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
           master->best_Al_chroma = Al;
         } else {
           master->scan_number = chroma_freq_split_scan_start - 1;
-          master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
+          master->pass_number = passes_per_scan * (master->scan_number + 1) - 1 + master->pass_number_scan_opt_base;
         }
       }
 
@@ -730,7 +731,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
             (idx == 3 && master->best_freq_split_idx_chroma != 2) ||
             (idx == 4 && master->best_freq_split_idx_chroma != 4)) {
           master->scan_number = cinfo->num_scans - 1;
-          master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
+          master->pass_number = passes_per_scan * (master->scan_number + 1) - 1 + master->pass_number_scan_opt_base;
           master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
         }
       }
diff --git a/jpegtran.c b/jpegtran.c
index 90900d0f..e9be7a08 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -178,9 +178,6 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 
       /* No table optimization required for AC */
       cinfo->optimize_coding = FALSE;
-      
-      /* Scan optimization currently incompatible with AC */
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_OPTIMIZE_SCANS, FALSE);
 #else
       fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
               progname);

From ddd54ff8a83c6f9e2df59b4ae4a1ffeb61396e1c Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 20 Dec 2014 03:32:59 +0000
Subject: [PATCH 064/108] Optimizations to the AltiVec DCT algorithms
 (pre-compute constants and combine multiply/add operations)

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1462 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_powerpc_altivec.c | 97 ++++++++++--------------------------
 1 file changed, 25 insertions(+), 72 deletions(-)

diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index 0d251dfa..7796b707 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -101,18 +101,13 @@
   tmp1312l = vec_mergeh(tmp13, tmp12);  \
   tmp1312h = vec_mergel(tmp13, tmp12);  \
   \
-  out2l = vec_msums(tmp1312l, PW_F130_F054, zero);  \
-  out2h = vec_msums(tmp1312h, PW_F130_F054, zero);  \
-  out6l = vec_msums(tmp1312l, PW_F054_MF130, zero);  \
-  out6h = vec_msums(tmp1312h, PW_F054_MF130, zero);  \
+  out2l = vec_msums(tmp1312l, PW_F130_F054, PD_DESCALE_P##PASS);  \
+  out2h = vec_msums(tmp1312h, PW_F130_F054, PD_DESCALE_P##PASS);  \
+  out6l = vec_msums(tmp1312l, PW_F054_MF130, PD_DESCALE_P##PASS);  \
+  out6h = vec_msums(tmp1312h, PW_F054_MF130, PD_DESCALE_P##PASS);  \
   \
-  out2l = vec_add(out2l, PD_DESCALE_P##PASS);  \
-  out2h = vec_add(out2h, PD_DESCALE_P##PASS);  \
   out2l = vec_sra(out2l, DESCALE_P##PASS);  \
   out2h = vec_sra(out2h, DESCALE_P##PASS);  \
-  \
-  out6l = vec_add(out6l, PD_DESCALE_P##PASS);  \
-  out6h = vec_add(out6h, PD_DESCALE_P##PASS);  \
   out6l = vec_sra(out6l, DESCALE_P##PASS);  \
   out6h = vec_sra(out6h, DESCALE_P##PASS);  \
   \
@@ -127,31 +122,21 @@
   z34l = vec_mergeh(z3, z4);  \
   z34h = vec_mergel(z3, z4);  \
   \
-  z3l = vec_msums(z34l, PW_MF078_F117, zero);  \
-  z3h = vec_msums(z34h, PW_MF078_F117, zero);  \
-  z4l = vec_msums(z34l, PW_F117_F078, zero);  \
-  z4h = vec_msums(z34h, PW_F117_F078, zero);  \
+  z3l = vec_msums(z34l, PW_MF078_F117, PD_DESCALE_P##PASS);  \
+  z3h = vec_msums(z34h, PW_MF078_F117, PD_DESCALE_P##PASS);  \
+  z4l = vec_msums(z34l, PW_F117_F078, PD_DESCALE_P##PASS);  \
+  z4h = vec_msums(z34h, PW_F117_F078, PD_DESCALE_P##PASS);  \
   \
   tmp47l = vec_mergeh(tmp4, tmp7);  \
   tmp47h = vec_mergel(tmp4, tmp7);  \
   \
-  tmp4l = vec_msums(tmp47l, PW_MF060_MF089, zero);  \
-  tmp4h = vec_msums(tmp47h, PW_MF060_MF089, zero);  \
-  tmp7l = vec_msums(tmp47l, PW_MF089_F060, zero);  \
-  tmp7h = vec_msums(tmp47h, PW_MF089_F060, zero);  \
+  out7l = vec_msums(tmp47l, PW_MF060_MF089, z3l);  \
+  out7h = vec_msums(tmp47h, PW_MF060_MF089, z3h);  \
+  out1l = vec_msums(tmp47l, PW_MF089_F060, z4l);  \
+  out1h = vec_msums(tmp47h, PW_MF089_F060, z4h);  \
   \
-  out7l = vec_add(z3l, tmp4l);  \
-  out7h = vec_add(z3h, tmp4h);  \
-  out1l = vec_add(z4l, tmp7l);  \
-  out1h = vec_add(z4h, tmp7h);  \
-  \
-  out7l = vec_add(out7l, PD_DESCALE_P##PASS);  \
-  out7h = vec_add(out7h, PD_DESCALE_P##PASS);  \
   out7l = vec_sra(out7l, DESCALE_P##PASS);  \
   out7h = vec_sra(out7h, DESCALE_P##PASS);  \
-  \
-  out1l = vec_add(out1l, PD_DESCALE_P##PASS);  \
-  out1h = vec_add(out1h, PD_DESCALE_P##PASS);  \
   out1l = vec_sra(out1l, DESCALE_P##PASS);  \
   out1h = vec_sra(out1h, DESCALE_P##PASS);  \
   \
@@ -161,23 +146,13 @@
   tmp56l = vec_mergeh(tmp5, tmp6);  \
   tmp56h = vec_mergel(tmp5, tmp6);  \
   \
-  tmp5l = vec_msums(tmp56l, PW_MF050_MF256, zero);  \
-  tmp5h = vec_msums(tmp56h, PW_MF050_MF256, zero);  \
-  tmp6l = vec_msums(tmp56l, PW_MF256_F050, zero);  \
-  tmp6h = vec_msums(tmp56h, PW_MF256_F050, zero);  \
+  out5l = vec_msums(tmp56l, PW_MF050_MF256, z4l);  \
+  out5h = vec_msums(tmp56h, PW_MF050_MF256, z4h);  \
+  out3l = vec_msums(tmp56l, PW_MF256_F050, z3l);  \
+  out3h = vec_msums(tmp56h, PW_MF256_F050, z3h);  \
   \
-  out5l = vec_add(tmp5l, z4l);  \
-  out5h = vec_add(tmp5h, z4h);  \
-  out3l = vec_add(tmp6l, z3l);  \
-  out3h = vec_add(tmp6h, z3h);  \
-  \
-  out5l = vec_add(out5l, PD_DESCALE_P##PASS);  \
-  out5h = vec_add(out5h, PD_DESCALE_P##PASS);  \
   out5l = vec_sra(out5l, DESCALE_P##PASS);  \
   out5h = vec_sra(out5h, DESCALE_P##PASS);  \
-  \
-  out3l = vec_add(out3l, PD_DESCALE_P##PASS);  \
-  out3h = vec_add(out3h, PD_DESCALE_P##PASS);  \
   out3l = vec_sra(out3l, DESCALE_P##PASS);  \
   out3h = vec_sra(out3h, DESCALE_P##PASS);  \
   \
@@ -230,8 +205,7 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
     tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
     z3, z4, z34l, z34h,
     out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector int tmp4l, tmp4h, tmp5l, tmp5h, tmp6l, tmp6h, tmp7l, tmp7h,
-    z3l, z3h, z4l, z4h,
+  __vector int z3l, z3h, z4l, z4h,
     out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
     out7l, out7h;
 
@@ -247,8 +221,7 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
     PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) },
     PW_DESCALE_P2X = { __8X(1 << (ISLOW_PASS1_BITS - 1)) };
   __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
-  __vector int zero = { __4X(0) },
-    PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
+  __vector int PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
     PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 - 1)) };
   __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
     DESCALE_P2 = { __4X(ISLOW_DESCALE_P2) };
@@ -352,11 +325,8 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
   z5 = vec_sub(tmp10, tmp12);  \
   z5 = vec_madds(z5, PW_0382, zero);  \
   \
-  z2 = vec_madds(tmp10, PW_0541, zero);  \
-  z2 = vec_add(z2, z5);  \
-  \
-  z4 = vec_madds(tmp12, PW_1306, zero);  \
-  z4 = vec_add(z4, z5);  \
+  z2 = vec_madds(tmp10, PW_0541, z5);  \
+  z4 = vec_madds(tmp12, PW_1306, z5);  \
   \
   tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
   z3 = vec_madds(tmp11, PW_0707, zero);  \
@@ -459,6 +429,8 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp0h = vec_unpackl(tmp0);  \
   tmp0l = vec_sl(tmp0l, CONST_BITS);  \
   tmp0h = vec_sl(tmp0h, CONST_BITS);  \
+  tmp0l = vec_add(tmp0l, PD_DESCALE_P##PASS);  \
+  tmp0h = vec_add(tmp0h, PD_DESCALE_P##PASS);  \
   \
   tmp10l = vec_add(tmp0l, tmp3l);  \
   tmp10h = vec_add(tmp0h, tmp3h);  \
@@ -469,6 +441,8 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp1h = vec_unpackl(tmp1);  \
   tmp1l = vec_sl(tmp1l, CONST_BITS);  \
   tmp1h = vec_sl(tmp1h, CONST_BITS);  \
+  tmp1l = vec_add(tmp1l, PD_DESCALE_P##PASS);  \
+  tmp1h = vec_add(tmp1h, PD_DESCALE_P##PASS);  \
   \
   tmp11l = vec_add(tmp1l, tmp2l);  \
   tmp11h = vec_add(tmp1h, tmp2h);  \
@@ -511,13 +485,8 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   out7l = vec_sub(tmp10l, tmp3l);  \
   out7h = vec_sub(tmp10h, tmp3h);  \
   \
-  out0l = vec_add(out0l, PD_DESCALE_P##PASS);  \
-  out0h = vec_add(out0h, PD_DESCALE_P##PASS);  \
   out0l = vec_sra(out0l, DESCALE_P##PASS);  \
   out0h = vec_sra(out0h, DESCALE_P##PASS);  \
-  \
-  out7l = vec_add(out7l, PD_DESCALE_P##PASS);  \
-  out7h = vec_add(out7h, PD_DESCALE_P##PASS);  \
   out7l = vec_sra(out7l, DESCALE_P##PASS);  \
   out7h = vec_sra(out7h, DESCALE_P##PASS);  \
   \
@@ -529,13 +498,8 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   out6l = vec_sub(tmp11l, tmp2l);  \
   out6h = vec_sub(tmp11h, tmp2h);  \
   \
-  out1l = vec_add(out1l, PD_DESCALE_P##PASS);  \
-  out1h = vec_add(out1h, PD_DESCALE_P##PASS);  \
   out1l = vec_sra(out1l, DESCALE_P##PASS);  \
   out1h = vec_sra(out1h, DESCALE_P##PASS);  \
-  \
-  out6l = vec_add(out6l, PD_DESCALE_P##PASS);  \
-  out6h = vec_add(out6h, PD_DESCALE_P##PASS);  \
   out6l = vec_sra(out6l, DESCALE_P##PASS);  \
   out6h = vec_sra(out6h, DESCALE_P##PASS);  \
   \
@@ -547,13 +511,8 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   out5l = vec_sub(tmp12l, tmp1l);  \
   out5h = vec_sub(tmp12h, tmp1h);  \
   \
-  out2l = vec_add(out2l, PD_DESCALE_P##PASS);  \
-  out2h = vec_add(out2h, PD_DESCALE_P##PASS);  \
   out2l = vec_sra(out2l, DESCALE_P##PASS);  \
   out2h = vec_sra(out2h, DESCALE_P##PASS);  \
-  \
-  out5l = vec_add(out5l, PD_DESCALE_P##PASS);  \
-  out5h = vec_add(out5h, PD_DESCALE_P##PASS);  \
   out5l = vec_sra(out5l, DESCALE_P##PASS);  \
   out5h = vec_sra(out5h, DESCALE_P##PASS);  \
   \
@@ -565,13 +524,8 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   out4l = vec_sub(tmp13l, tmp0l);  \
   out4h = vec_sub(tmp13h, tmp0h);  \
   \
-  out3l = vec_add(out3l, PD_DESCALE_P##PASS);  \
-  out3h = vec_add(out3h, PD_DESCALE_P##PASS);  \
   out3l = vec_sra(out3l, DESCALE_P##PASS);  \
   out3h = vec_sra(out3h, DESCALE_P##PASS);  \
-  \
-  out4l = vec_add(out4l, PD_DESCALE_P##PASS);  \
-  out4h = vec_add(out4h, PD_DESCALE_P##PASS);  \
   out4l = vec_sra(out4l, DESCALE_P##PASS);  \
   out4h = vec_sra(out4h, DESCALE_P##PASS);  \
   \
@@ -765,9 +719,8 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
   \
   tmp10 = vec_madds(z12s, PW_F1082, zero);  \
   tmp10 = vec_sub(tmp10, z5);  \
-  tmp12 = vec_madds(z10s, PW_MF1613, zero);  \
+  tmp12 = vec_madds(z10s, PW_MF1613, z5);  \
   tmp12 = vec_sub(tmp12, z10);  \
-  tmp12 = vec_add(tmp12, z5);  \
   \
   tmp6 = vec_sub(tmp12, tmp7);  \
   tmp5 = vec_sub(tmp11, tmp6);  \

From 9d8efde83b1619801a1519b954232a851e160ce6 Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Sat, 20 Dec 2014 13:21:16 -0600
Subject: [PATCH 065/108] Take upstream r1455 to fix issue #95.

---
 README | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README b/README
index 44e69d39..e82a095b 100644
--- a/README
+++ b/README
@@ -36,7 +36,6 @@ TO DO               Plans for future IJG releases.
 Other documentation files in the distribution are:
 
 User documentation:
-  install.txt       How to configure and install the IJG software.
   usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                     rdjpgcom, and wrjpgcom.
   *.1               Unix-style man pages for programs (same info as usage.txt).
@@ -48,9 +47,9 @@ Programmer and internal documentation:
   structure.txt     Overview of the JPEG library's internal structure.
   coderules.txt     Coding style rules --- please read if you contribute code.
 
-Please read at least the files install.txt and usage.txt.  Some information
-can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
-ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
+Please read at least usage.txt.  Some information can also be found in the JPEG
+FAQ (Frequently Asked Questions) article.  See ARCHIVE LOCATIONS below to find
+out where to obtain the FAQ article.
 
 If you want to understand how the JPEG code works, we suggest reading one or
 more of the REFERENCES, then looking at the documentation files (in roughly

From 4802ddd7f7bf8f071fa42bfd767b5da996f1168e Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sun, 21 Dec 2014 01:52:38 +0100
Subject: [PATCH 066/108] Partial fix for #138

Initial implementation of trellis quantization for arithmetic coding.
The rate computation does not yet implement all rules of the entropy
coder and may thus be suboptimal.
---
 cjpeg.c    |   3 -
 jcarith.c  |  45 ++++++-
 jccoefct.c |  33 +++--
 jcdctmgr.c | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 jcmaster.c |   4 +-
 jpegint.h  |  29 +++++
 6 files changed, 450 insertions(+), 19 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index 74096782..5695e7d0 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -292,9 +292,6 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       
       /* No table optimization required for AC */
       cinfo->optimize_coding = FALSE;
-      
-      /* Trellis quantization currently incompatible with AC */
-      jpeg_c_set_bool_param(cinfo, JBOOLEAN_TRELLIS_QUANT, FALSE);
 #else
       fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
               progname);
diff --git a/jcarith.c b/jcarith.c
index 71a84ddb..50c52bc4 100644
--- a/jcarith.c
+++ b/jcarith.c
@@ -18,7 +18,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-
+#include <math.h>
 
 /* Expanded entropy encoder object for arithmetic encoding. */
 
@@ -120,6 +120,10 @@ emit_byte (int val, j_compress_ptr cinfo)
 {
   struct jpeg_destination_mgr * dest = cinfo->dest;
 
+  /* Do not emit bytes during trellis passes */
+  if (cinfo->master->trellis_passes)
+    return;
+  
   *dest->next_output_byte++ = (JOCTET) val;
   if (--dest->free_in_buffer == 0)
     if (! (*dest->empty_output_buffer) (cinfo))
@@ -826,6 +830,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci, tbl;
   jpeg_component_info * compptr;
+  boolean progressive_mode;
 
   if (gather_statistics)
     /* Make sure to avoid that in the master control logic!
@@ -836,8 +841,12 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
 
   /* We assume jcmaster.c already validated the progressive scan parameters. */
 
+  /* Trellis optimization does DC and AC in same pass and without refinement
+   * so consider progressive mode to be off in such case */
+  progressive_mode = (cinfo->master->trellis_passes) ? FALSE : cinfo->progressive_mode;
+  
   /* Select execution routines */
-  if (cinfo->progressive_mode) {
+  if (progressive_mode) {
     if (cinfo->Ah == 0) {
       if (cinfo->Ss == 0)
         entropy->pub.encode_mcu = encode_mcu_DC_first;
@@ -856,7 +865,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     /* DC needs no table for refinement scan */
-    if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+    if (progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
       tbl = compptr->dc_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
@@ -869,7 +878,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
       entropy->dc_context[ci] = 0;
     }
     /* AC needs no table when not present */
-    if (cinfo->progressive_mode == 0 || cinfo->Se) {
+    if (progressive_mode == 0 || cinfo->Se) {
       tbl = compptr->ac_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
@@ -878,7 +887,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
           ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
       MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
 #ifdef CALCULATE_SPECTRAL_CONDITIONING
-      if (cinfo->progressive_mode)
+      if (progressive_mode)
         /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
         cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
 #endif
@@ -925,3 +934,29 @@ jinit_arith_encoder (j_compress_ptr cinfo)
   /* Initialize index for fixed probability estimation */
   entropy->fixed_bin[0] = 113;
 }
+
+/* Convert the current arithmetic-coder statistics bins into bit costs in *r. */
+GLOBAL(void)
+jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rates *r)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int bin;
+
+  for (bin = 0; bin < DC_STAT_BINS; bin++) {
+    int st = entropy->dc_stats[dc_tbl_no][bin];  /* bit 7: MPS value, bits 0-6: table index */
+    float p_lps = (jpeg_aritab[st & 0x7f] >> 16) / 46340.95; /* 32768*sqrt(2) */
+    float p_zero = (st >> 7) ? p_lps : 1.0 - p_lps;
+    float p_one = 1.0 - p_zero;
+    r->rate_dc[bin][0] = -log(p_zero) / log(2.0);  /* cost in bits of coding a 0 */
+    r->rate_dc[bin][1] = -log(p_one) / log(2.0);   /* cost in bits of coding a 1 */
+  }
+
+  for (bin = 0; bin < AC_STAT_BINS; bin++) {
+    int st = entropy->ac_stats[ac_tbl_no][bin];
+    float p_lps = (jpeg_aritab[st & 0x7f] >> 16) / 46340.95;
+    float p_zero = (st >> 7) ? p_lps : 1.0 - p_lps;
+    float p_one = 1.0 - p_zero;
+    r->rate_ac[bin][0] = -log(p_zero) / log(2.0);
+    r->rate_ac[bin][1] = -log(p_one) / log(2.0);
+  }
+}
diff --git a/jccoefct.c b/jccoefct.c
index 3a792c8b..7c3d58f9 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -19,7 +19,6 @@
 #include "jpeglib.h"
 #include "jchuff.h"
 
-
 /* We use a full-image coefficient buffer when doing Huffman optimization,
  * and also for writing multiple-scan JPEG files.  In all cases, the DCT
  * step is run during the first pass, and subsequent passes need only read
@@ -367,10 +366,18 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     c_derived_tbl *dctbl = &dctbl_data;
     c_derived_tbl actbl_data;
     c_derived_tbl *actbl = &actbl_data;
+    
+    arith_rates arith_r_data;
+    arith_rates *arith_r = &arith_r_data;
+    
     compptr = cinfo->cur_comp_info[ci];
 
-    jpeg_make_c_derived_tbl(cinfo, TRUE, compptr->dc_tbl_no, &dctbl);
-    jpeg_make_c_derived_tbl(cinfo, FALSE, compptr->ac_tbl_no, &actbl);
+    if (cinfo->arith_code)
+      jget_arith_rates(cinfo, compptr->dc_tbl_no, compptr->ac_tbl_no, arith_r);
+    else {
+      jpeg_make_c_derived_tbl(cinfo, TRUE, compptr->dc_tbl_no, &dctbl);
+      jpeg_make_c_derived_tbl(cinfo, FALSE, compptr->ac_tbl_no, &actbl);
+    }
     
     /* Align the virtual buffer for this component. */
     buffer = (*cinfo->mem->access_virt_barray)
@@ -406,12 +413,20 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
       lastblockrow = (block_row > 0) ? buffer[block_row-1] : NULL;
-      quantize_trellis(cinfo, dctbl, actbl, thisblockrow,
-                       buffer_dst[block_row], blocks_across,
-                       cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
-                       cinfo->master->norm_src[compptr->quant_tbl_no],
-                       cinfo->master->norm_coef[compptr->quant_tbl_no],
-                       &lastDC, lastblockrow, buffer_dst[block_row-1]);
+      if (cinfo->arith_code)
+        quantize_trellis_arith(cinfo, arith_r, thisblockrow,
+                               buffer_dst[block_row], blocks_across,
+                               cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
+                               cinfo->master->norm_src[compptr->quant_tbl_no],
+                               cinfo->master->norm_coef[compptr->quant_tbl_no],
+                               &lastDC, lastblockrow, buffer_dst[block_row-1]);
+      else
+        quantize_trellis(cinfo, dctbl, actbl, thisblockrow,
+                         buffer_dst[block_row], blocks_across,
+                         cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
+                         cinfo->master->norm_src[compptr->quant_tbl_no],
+                         cinfo->master->norm_coef[compptr->quant_tbl_no],
+                         &lastDC, lastblockrow, buffer_dst[block_row-1]);
       
       if (ndummy > 0) {
         /* Create dummy blocks at the right edge of the image. */
diff --git a/jcdctmgr.c b/jcdctmgr.c
index a373f3ff..37e39e5e 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -1261,6 +1261,361 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
 
 }
 
+/* Rate-distortion (trellis) quantization of one row of num_blocks blocks,
+ * using the per-bin bit costs in *r as the rate estimate.  src supplies the
+ * unquantized coefficients; the chosen levels are written to coef_blocks
+ * (AC always, DC and *last_dc_val only when trellis_quant_dc is set).
+ * coef_blocks_above/src_above may be NULL (vertical DC term is then skipped).
+ */
+GLOBAL(void)
+quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
+                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
+                 JBLOCKROW coef_blocks_above, JBLOCKROW src_above)
+{
+  int i, j, k, l, bi;
+  float accumulated_zero_dist[DCTSIZE2];
+  float accumulated_cost[DCTSIZE2];
+  int run_start[DCTSIZE2];
+  float best_cost;
+  int last_coeff_idx; /* position of last nonzero coefficient */
+  float norm = 0.0;
+  float lambda_base, lambda, lambda_dc;
+  const float *lambda_tbl = (cinfo->master->use_lambda_weight_tbl) ?
+  jpeg_lambda_weights_csf_luma :
+  jpeg_lambda_weights_flat;
+  int Ss, Se;
+  int has_eob;
+  float cost_all_zeros, best_cost_skip, cost, run_bits;
+  int rate;
+  float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
+  int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
+  JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
+  int mode = 1;
+  float lambda_table[DCTSIZE2];
+  
+  /* Arithmetic coding context. Set to 0 for now but can be refined */
+  int dc_context = 0;
+  
+  Ss = cinfo->Ss;
+  Se = cinfo->Se;
+  if (Ss == 0)
+    Ss = 1;  /* the DC coefficient is handled separately below */
+  if (Se < Ss)
+    return;
+  
+  if (cinfo->master->trellis_quant_dc) {
+    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
+      accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
+      dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
+      dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
+      if (!accumulated_dc_cost[i] ||
+          !dc_cost_backtrack[i] ||
+          !dc_candidate[i]) {
+        ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
+      }
+    }
+  }
+  
+  norm = 0.0;
+  for (i = 1; i < DCTSIZE2; i++) {
+    norm += qtbl->quantval[i] * qtbl->quantval[i];
+  }
+  norm /= 63.0;
+  
+  if (mode == 1) {
+    lambda_base = 1.0;
+    lambda_tbl = lambda_table;
+    for (i = 0; i < DCTSIZE2; i++)
+      lambda_table[i] = 1.0 / (qtbl->quantval[i] * qtbl->quantval[i]);
+  } else
+    lambda_base = 1.0 / norm;
+  
+  for (bi = 0; bi < num_blocks; bi++) {
+    
+    norm = 0.0;
+    for (i = 1; i < DCTSIZE2; i++) {
+      norm += src[bi][i] * src[bi][i];
+    }
+    norm /= 63.0;
+    
+    if (cinfo->master->lambda_log_scale2 > 0.0)
+      lambda = pow(2.0, cinfo->master->lambda_log_scale1) * lambda_base /
+      (pow(2.0, cinfo->master->lambda_log_scale2) + norm);
+    else
+      lambda = pow(2.0, cinfo->master->lambda_log_scale1 - 12.0) * lambda_base;
+    
+    lambda_dc = lambda * lambda_tbl[0];
+    
+    accumulated_zero_dist[Ss-1] = 0.0;
+    accumulated_cost[Ss-1] = 0.0;
+    
+    /* Do DC coefficient */
+    if (cinfo->master->trellis_quant_dc) {
+      int sign = src[bi][0] >> 31;
+      int x = abs(src[bi][0]);
+      int q = 8 * qtbl->quantval[0];
+      int qval;
+      float dc_candidate_dist;
+      
+      qval = (x + q/2) / q; /* quantized value (round nearest) */
+      for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
+        int delta;
+        int dc_delta;
+        float bits;
+        int st = dc_context;
+        int m;
+        int v2;
+        
+        dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
+        delta = dc_candidate[k][bi] * q - x;
+        dc_candidate_dist = delta * delta * lambda_dc;
+        dc_candidate[k][bi] *= 1 + 2*sign;
+        
+        /* Take into account DC differences */
+        if (coef_blocks_above && src_above && cinfo->master->trellis_delta_dc_weight > 0.0) {
+          int dc_above_orig;
+          int dc_above_recon;
+          int dc_orig;
+          int dc_recon;
+          float vertical_dist;
+          
+          dc_above_orig = src_above[bi][0];
+          dc_above_recon = coef_blocks_above[bi][0] * q;
+          dc_orig = src[bi][0];
+          dc_recon = dc_candidate[k][bi] * q;
+          /* delta is difference of vertical gradients */
+          delta = (dc_above_orig - dc_orig) - (dc_above_recon - dc_recon);
+          vertical_dist = delta * delta * lambda_dc;
+          dc_candidate_dist +=  cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
+        }
+        
+        if (bi == 0) {
+          dc_delta = dc_candidate[k][bi] - *last_dc_val;
+          
+          bits = r->rate_dc[st][dc_delta != 0];
+          if (dc_delta != 0) {
+            bits += r->rate_dc[st+1][dc_delta < 0];
+            st += 2 + (dc_delta < 0);
+            dc_delta = abs(dc_delta);
+            
+            m = 0;
+            if (dc_delta -= 1) {
+              bits += r->rate_dc[st][1];
+              st = 20;
+              m = 1;
+              v2 = dc_delta;
+              while (v2 >>= 1) {
+                bits += r->rate_dc[st][1];
+                m <<= 1;
+                st++;
+              }
+            }
+            bits += r->rate_dc[st][0];
+            st += 14;
+            while (m >>= 1)
+              bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
+          }
+          cost = bits + dc_candidate_dist;
+          accumulated_dc_cost[k][0] = cost;
+          dc_cost_backtrack[k][0] = -1;
+        } else {
+          for (l = 0; l < DC_TRELLIS_CANDIDATES; l++) {
+            dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
+            st = dc_context;  /* restart the bin walk for every predecessor l */
+            bits = r->rate_dc[st][dc_delta != 0];
+            if (dc_delta != 0) {
+              bits += r->rate_dc[st+1][dc_delta < 0];
+              st += 2 + (dc_delta < 0);
+              dc_delta = abs(dc_delta);
+              
+              m = 0;
+              if (dc_delta -= 1) {
+                bits += r->rate_dc[st][1];
+                st = 20;
+                m = 1;
+                v2 = dc_delta;
+                while (v2 >>= 1) {
+                  bits += r->rate_dc[st][1];
+                  m <<= 1;
+                  st++;
+                }
+              }
+              bits += r->rate_dc[st][0];
+              st += 14;
+              while (m >>= 1)
+                bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
+            }
+            
+            cost = bits + dc_candidate_dist + accumulated_dc_cost[l][bi-1];
+            if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
+              accumulated_dc_cost[k][bi] = cost;
+              dc_cost_backtrack[k][bi] = l;
+            }
+          }
+        }
+      }
+    }
+    
+    /* Do AC coefficients */
+    for (i = Ss; i <= Se; i++) {
+      int z = jpeg_natural_order[i];
+      
+      int sign = src[bi][z] >> 31;
+      int x = abs(src[bi][z]);
+      int q = 8 * qtbl->quantval[z];
+      int candidate[16];
+      float candidate_dist[16];
+      int num_candidates;
+      int qval;
+      int delta;
+      
+      accumulated_zero_dist[i] = x * x * lambda * lambda_tbl[z] + accumulated_zero_dist[i-1];
+      
+      qval = (x + q/2) / q; /* quantized value (round nearest) */
+      
+      if (qval == 0) {
+        coef_blocks[bi][z] = 0;
+        accumulated_cost[i] = 1e38; /* Shouldn't be needed */
+        continue;
+      }
+      
+      k = 0;
+      candidate[k] = qval;
+      delta = candidate[k] * q - x;
+      candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
+      k++;
+      if (qval > 1) {
+        candidate[k] = qval - 1;
+        delta = candidate[k] * q - x;
+        candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
+        k++;
+      }
+      num_candidates = k;
+      
+      accumulated_cost[i] = 1e38;
+      
+      for (j = Ss-1; j < i; j++) {
+        int zz = jpeg_natural_order[j];
+        if (j != Ss-1 && coef_blocks[bi][zz] == 0)
+          continue;
+        
+        run_bits = r->rate_ac[3*j][0]; /* EOB */
+        for (k = j+1; k < i; k++)
+          run_bits += r->rate_ac[3*(k-1)+1][0];
+        run_bits += r->rate_ac[3*(i-1)+1][1];
+        
+        for (k = 0; k < num_candidates; k++) {
+          float coef_bits = 1.0; /* sign bit */
+          int v = candidate[k];
+          int v2;
+          int m;
+          int st;
+          
+          st = 3*(i-1)+2;
+          m = 0;
+          if (v -= 1) {
+            coef_bits += r->rate_ac[st][1];
+            m = 1;
+            v2 = v;
+            if (v2 >>= 1) {
+              coef_bits += r->rate_ac[st][1];
+              m <<= 1;
+              st = 189; /* TODO: condition 189/217 */
+              while (v2 >>= 1) {
+                coef_bits += r->rate_ac[st][1];
+                m <<= 1;
+                st++;
+              }
+            }
+          }
+          coef_bits += r->rate_ac[st][0];
+          st += 14;
+          while (m >>= 1)
+            coef_bits += r->rate_ac[st][(m & v) ? 1 : 0];
+          
+          rate = coef_bits + run_bits;  /* NOTE(review): rate is int, so the fractional bit cost is truncated */
+          cost = rate + candidate_dist[k];
+          cost += accumulated_zero_dist[i-1] - accumulated_zero_dist[j] + accumulated_cost[j];
+          
+          if (cost < accumulated_cost[i]) {
+            coef_blocks[bi][z] = (candidate[k] ^ sign) - sign;
+            accumulated_cost[i] = cost;
+            run_start[i] = j;
+          }
+        }
+      }
+    }
+    
+    last_coeff_idx = Ss-1;
+    best_cost = accumulated_zero_dist[Se] + r->rate_ac[0][1];
+    cost_all_zeros = accumulated_zero_dist[Se];
+    best_cost_skip = cost_all_zeros;
+    
+    for (i = Ss; i <= Se; i++) {
+      int z = jpeg_natural_order[i];
+      if (coef_blocks[bi][z] != 0) {
+        float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
+        float cost_wo_eob = cost;
+        
+        if (i < Se)
+          cost += r->rate_ac[3*(i-1)][1];
+        
+        if (cost < best_cost) {
+          best_cost = cost;
+          last_coeff_idx = i;
+          best_cost_skip = cost_wo_eob;
+        }
+      }
+    }
+    
+    has_eob = (last_coeff_idx < Se) + (last_coeff_idx == Ss-1);
+    
+    /* Zero out coefficients that are part of runs */
+    i = Se;
+    while (i >= Ss)
+    {
+      while (i > last_coeff_idx) {
+        int z = jpeg_natural_order[i];
+        coef_blocks[bi][z] = 0;
+        i--;
+      }
+      last_coeff_idx = run_start[i];
+      i--;
+    }
+    
+  }
+  
+  if (cinfo->master->trellis_q_opt) {
+    for (bi = 0; bi < num_blocks; bi++) {
+      for (i = 1; i < DCTSIZE2; i++) {
+        norm_src[i] += src[bi][i] * coef_blocks[bi][i];
+        norm_coef[i] += 8 * coef_blocks[bi][i] * coef_blocks[bi][i];
+      }
+    }
+  }
+  
+  if (cinfo->master->trellis_quant_dc) {
+    j = 0;
+    for (i = 1; i < DC_TRELLIS_CANDIDATES; i++) {
+      if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
+        j = i;
+    }
+    for (bi = num_blocks-1; bi >= 0; bi--) {
+      coef_blocks[bi][0] = dc_candidate[j][bi];
+      j = dc_cost_backtrack[j][bi];
+    }
+    
+    /* Save DC predictor */
+    *last_dc_val = coef_blocks[num_blocks-1][0];
+    
+    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
+      free(accumulated_dc_cost[i]);
+      free(dc_cost_backtrack[i]);
+      free(dc_candidate[i]);
+    }
+  }
+}
+
 /*
  * Initialize FDCT manager.
  */
diff --git a/jcmaster.c b/jcmaster.c
index 62cfe07e..bf5bf2f5 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -475,7 +475,7 @@ prepare_for_pass (j_compress_ptr cinfo)
       (*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
     }
     (*cinfo->fdct->start_pass) (cinfo);
-    (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding || cinfo->master->trellis_quant);
+    (*cinfo->entropy->start_pass) (cinfo, (cinfo->optimize_coding || cinfo->master->trellis_quant) && !cinfo->arith_code);
     (*cinfo->coef->start_pass) (cinfo,
                                 (master->total_passes > 1 ?
                                  JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
@@ -541,7 +541,7 @@ prepare_for_pass (j_compress_ptr cinfo)
         }
       }
     }
-    (*cinfo->entropy->start_pass) (cinfo, TRUE);
+    (*cinfo->entropy->start_pass) (cinfo, !cinfo->arith_code);
     (*cinfo->coef->start_pass) (cinfo, JBUF_REQUANT);
     master->pub.call_pass_startup = FALSE;
     break;
diff --git a/jpegint.h b/jpegint.h
index 1b8f77bf..424f2d54 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -91,6 +91,27 @@ struct jpeg_comp_master {
   float trellis_delta_dc_weight;
 };
 
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+typedef struct {
+  float rate_dc[DC_STAT_BINS][2];
+  float rate_ac[AC_STAT_BINS][2];
+} arith_rates;
+
 /* Main buffer control (downsampled-data buffer) */
 struct jpeg_c_main_controller {
   void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
@@ -364,6 +385,14 @@ EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
 EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
                               JDIMENSION num_blocks);
 EXTERN(void) jzero_far (void * target, size_t bytestozero);
+
+EXTERN(void) jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rates *r);
+
+EXTERN(void) quantize_trellis_arith
+(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
+ JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
+ JBLOCKROW coef_blocks_above, JBLOCKROW src_above);
+
 /* Constant tables in jutils.c */
 #if 0                           /* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */

From 888d4075ee43bcf706d1475f55c4a98b9a91e97e Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sun, 21 Dec 2014 10:38:50 +0100
Subject: [PATCH 067/108] Refine rate estimate in trellis

Take into account cutoff parameter to switch between sets of contexts
for rate estimation in arithmetic coding version of trellis quantization
---
 jcarith.c  | 3 +++
 jcdctmgr.c | 2 +-
 jpegint.h  | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/jcarith.c b/jcarith.c
index 50c52bc4..50ed4156 100644
--- a/jcarith.c
+++ b/jcarith.c
@@ -940,6 +940,9 @@ jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rate
 {
   int i;
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  
+  r->arith_ac_K = cinfo->arith_ac_K[ac_tbl_no];
+  
   for (i = 0; i < DC_STAT_BINS; i++) {
     int state = entropy->dc_stats[dc_tbl_no][i];
     int mps_val = state >> 7;
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 37e39e5e..6292a42a 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -1520,7 +1520,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
             if (v2 >>= 1) {
               coef_bits += r->rate_ac[st][1];
               m <<= 1;
-              st = 189; /* TODO: condition 189/217 */
+              st = (i <= r->arith_ac_K) ? 189 : 217;
               while (v2 >>= 1) {
                 coef_bits += r->rate_ac[st][1];
                 m <<= 1;
diff --git a/jpegint.h b/jpegint.h
index 424f2d54..42c3c5d5 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -110,6 +110,7 @@ struct jpeg_comp_master {
 typedef struct {
   float rate_dc[DC_STAT_BINS][2];
   float rate_ac[AC_STAT_BINS][2];
+  int arith_ac_K;
 } arith_rates;
 
 /* Main buffer control (downsampled-data buffer) */

From 933289f509a25c5cb8d2117327c458f833514258 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sun, 21 Dec 2014 12:46:43 +0100
Subject: [PATCH 068/108] Clean up trellis code

Avoid code duplication in computation of DC delta rate in trellis
quantization for arithmetic coding
---
 jcdctmgr.c | 52 +++++++++++++++-------------------------------------
 1 file changed, 15 insertions(+), 37 deletions(-)

diff --git a/jcdctmgr.c b/jcdctmgr.c
index 6292a42a..21d3479e 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -1358,6 +1358,8 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
       float dc_candidate_dist;
       
       qval = (x + q/2) / q; /* quantized value (round nearest) */
+      
+      /* loop over candidates in current block */
       for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
         int delta;
         int dc_delta;
@@ -1389,10 +1391,14 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
           dc_candidate_dist +=  cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
         }
         
-        if (bi == 0) {
-          dc_delta = dc_candidate[k][bi] - *last_dc_val;
+        /* loop of candidates from previous block */
+        for (l = 0; l < (bi == 0 ? 1 : DC_TRELLIS_CANDIDATES); l++) {
+          int dc_pred = (bi == 0 ? *last_dc_val : dc_candidate[l][bi-1]);
+          
+          dc_delta = dc_candidate[k][bi] - dc_pred;
           
           bits = r->rate_dc[st][dc_delta != 0];
+          
           if (dc_delta != 0) {
             bits += r->rate_dc[st+1][dc_delta < 0];
             st += 2 + (dc_delta < 0);
@@ -1415,42 +1421,14 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
             while (m >>= 1)
               bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
           }
+          
           cost = bits + dc_candidate_dist;
-          accumulated_dc_cost[k][0] = cost;
-          dc_cost_backtrack[k][0] = -1;
-        } else {
-          for (l = 0; l < DC_TRELLIS_CANDIDATES; l++) {
-            dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
-            
-            bits = r->rate_dc[st][dc_delta != 0];
-            if (dc_delta != 0) {
-              bits += r->rate_dc[st+1][dc_delta < 0];
-              st += 2 + (dc_delta < 0);
-              dc_delta = abs(dc_delta);
-              
-              m = 0;
-              if (dc_delta -= 1) {
-                bits += r->rate_dc[st][1];
-                st = 20;
-                m = 1;
-                v2 = dc_delta;
-                while (v2 >>= 1) {
-                  bits += r->rate_dc[st][1];
-                  m <<= 1;
-                  st++;
-                }
-              }
-              bits += r->rate_dc[st][0];
-              st += 14;
-              while (m >>= 1)
-                bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
-            }
-            
-            cost = bits + dc_candidate_dist + accumulated_dc_cost[l][bi-1];
-            if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
-              accumulated_dc_cost[k][bi] = cost;
-              dc_cost_backtrack[k][bi] = l;
-            }
+          if (bi != 0)
+            cost += accumulated_dc_cost[l][bi-1];
+          
+          if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
+            accumulated_dc_cost[k][bi] = cost;
+            dc_cost_backtrack[k][bi] = (bi == 0 ? -1 : l);
           }
         }
       }

From fa628eff6a2690a348ea2a427401d29f64baa053 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Sun, 21 Dec 2014 21:36:06 +0100
Subject: [PATCH 069/108] Refine rate estimation in trellis quant

Account for the more elaborate context modeling used for coding the DC
coefficient differences
---
 jcarith.c  |  2 ++
 jcdctmgr.c | 23 +++++++++++++++++------
 jpegint.h  |  2 ++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/jcarith.c b/jcarith.c
index 50ed4156..7adb989a 100644
--- a/jcarith.c
+++ b/jcarith.c
@@ -941,6 +941,8 @@ jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rate
   int i;
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   
+  r->arith_dc_L = cinfo->arith_dc_L[dc_tbl_no];
+  r->arith_dc_U = cinfo->arith_dc_U[dc_tbl_no];
   r->arith_ac_K = cinfo->arith_ac_K[ac_tbl_no];
   
   for (i = 0; i < DC_STAT_BINS; i++) {
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 21d3479e..3380b05c 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -1290,12 +1290,11 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
   float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
   int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
   JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
+  int *dc_context[DC_TRELLIS_CANDIDATES];
+  
   int mode = 1;
   float lambda_table[DCTSIZE2];
   
-  /* Arithmetic coding context. Set to 0 for now but can refined */
-  int dc_context = 0;
-  
   Ss = cinfo->Ss;
   Se = cinfo->Se;
   if (Ss == 0)
@@ -1308,9 +1307,11 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
       accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
       dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
       dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
+      dc_context[i] = (int *)malloc(num_blocks * sizeof(int));
       if (!accumulated_dc_cost[i] ||
           !dc_cost_backtrack[i] ||
-          !dc_candidate[i]) {
+          !dc_candidate[i] ||
+          !dc_context[i]) {
         ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
       }
     }
@@ -1364,7 +1365,6 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
         int delta;
         int dc_delta;
         float bits;
-        int st = dc_context;
         int m;
         int v2;
         
@@ -1394,7 +1394,8 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
         /* loop of candidates from previous block */
         for (l = 0; l < (bi == 0 ? 1 : DC_TRELLIS_CANDIDATES); l++) {
           int dc_pred = (bi == 0 ? *last_dc_val : dc_candidate[l][bi-1]);
-          
+          int updated_dc_context = 0;
+          int st = (bi == 0) ? 0 : dc_context[l][bi-1];
           dc_delta = dc_candidate[k][bi] - dc_pred;
           
           bits = r->rate_dc[st][dc_delta != 0];
@@ -1402,6 +1403,8 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
           if (dc_delta != 0) {
             bits += r->rate_dc[st+1][dc_delta < 0];
             st += 2 + (dc_delta < 0);
+            updated_dc_context = (dc_delta < 0) ? 8 : 4;
+            
             dc_delta = abs(dc_delta);
             
             m = 0;
@@ -1417,6 +1420,12 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
               }
             }
             bits += r->rate_dc[st][0];
+            
+            if (m < (int) ((1L << r->arith_dc_L) >> 1))
+              updated_dc_context = 0;    /* zero diff category */
+            else if (m > (int) ((1L << r->arith_dc_U) >> 1))
+              updated_dc_context += 8;   /* large diff category */
+
             st += 14;
             while (m >>= 1)
               bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
@@ -1429,6 +1438,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
           if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
             accumulated_dc_cost[k][bi] = cost;
             dc_cost_backtrack[k][bi] = (bi == 0 ? -1 : l);
+            dc_context[k][bi] = updated_dc_context;
           }
         }
       }
@@ -1590,6 +1600,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
       free(accumulated_dc_cost[i]);
       free(dc_cost_backtrack[i]);
       free(dc_candidate[i]);
+      free(dc_context[i]);
     }
   }
 }
diff --git a/jpegint.h b/jpegint.h
index 42c3c5d5..194bb9ee 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -110,6 +110,8 @@ struct jpeg_comp_master {
 typedef struct {
   float rate_dc[DC_STAT_BINS][2];
   float rate_ac[AC_STAT_BINS][2];
+  int arith_dc_L;
+  int arith_dc_U;
   int arith_ac_K;
 } arith_rates;
 

From 598cd994f3db36a87393eea95cd6afcbdd622bed Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 01:00:42 +0000
Subject: [PATCH 070/108] Split AltiVec algorithms into separate files for ease
 of maintenance;  Rename constants using lowercase so they are not confused
 with macros

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1463 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/Makefile.am             |   6 +-
 simd/jfdctfst-altivec.c      | 156 +++++++
 simd/jfdctint-altivec.c      | 225 +++++++++
 simd/jidctfst-altivec.c      | 226 +++++++++
 simd/jidctint-altivec.c      | 299 ++++++++++++
 simd/jsimd_altivec.h         |  76 +++
 simd/jsimd_powerpc_altivec.c | 862 -----------------------------------
 7 files changed, 986 insertions(+), 864 deletions(-)
 create mode 100644 simd/jfdctfst-altivec.c
 create mode 100644 simd/jfdctint-altivec.c
 create mode 100644 simd/jidctfst-altivec.c
 create mode 100644 simd/jidctint-altivec.c
 create mode 100644 simd/jsimd_altivec.h
 delete mode 100644 simd/jsimd_powerpc_altivec.c

diff --git a/simd/Makefile.am b/simd/Makefile.am
index c5213b8c..d7b8d02a 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -72,8 +72,10 @@ endif
 
 if SIMD_POWERPC
 
-libsimd_la_SOURCES = jsimd_powerpc.c jsimd_powerpc_altivec.c
-libsimd_la_CFLAGS = -maltivec 
+libsimd_la_SOURCES = jsimd_powerpc.c \
+	jfdctfst-altivec.c    jfdctint-altivec.c \
+	jidctfst-altivec.c    jidctint-altivec.c
+libsimd_la_CFLAGS = -maltivec
 
 endif
 
diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c
new file mode 100644
index 00000000..0c3687f8
--- /dev/null
+++ b/simd/jfdctfst-altivec.c
@@ -0,0 +1,156 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382 98   /* FIX(0.382683433) */
+#define F_0_541 139  /* FIX(0.541196100) */
+#define F_0_707 181  /* FIX(0.707106781) */
+#define F_1_306 334  /* FIX(1.306562965) */
+
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+/* One 1-D FDCT pass: reads tmp0..tmp7, writes out0..out7; tmp10..tmp13 and z* are scratch. */
+#define DO_FDCT()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  \
+  z1 = vec_add(tmp12, tmp13);  \
+  z1 = vec_sl(z1, pre_multiply_scale_bits);  /* pre-scale for vec_madds */  \
+  z1 = vec_madds(z1, pw_0707, zero);  \
+  \
+  out2 = vec_add(tmp13, z1);  \
+  out6 = vec_sub(tmp13, z1);  \
+  \
+  /* Odd part */  \
+  \
+  tmp10 = vec_add(tmp4, tmp5);  \
+  tmp11 = vec_add(tmp5, tmp6);  \
+  tmp12 = vec_add(tmp6, tmp7);  \
+  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
+  z5 = vec_sub(tmp10, tmp12);  \
+  z5 = vec_madds(z5, pw_0382, zero);  \
+  \
+  z2 = vec_madds(tmp10, pw_0541, z5);  /* z5 is folded in as the addend */  \
+  z4 = vec_madds(tmp12, pw_1306, z5);  \
+  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
+  z3 = vec_madds(tmp11, pw_0707, zero);  \
+  \
+  z11 = vec_add(tmp7, z3);  \
+  z13 = vec_sub(tmp7, z3);  \
+  \
+  out5 = vec_add(z13, z2);  \
+  out3 = vec_sub(z13, z2);  \
+  out1 = vec_add(z11, z4);  \
+  out7 = vec_sub(z11, z4);  \
+}
+
+/* Fast integer forward DCT computed in place: data[] is read as eight vectors and overwritten. */
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants (multipliers are pre-shifted by CONST_SHIFT; see the file header note) */
+  __vector short zero = vec_splat_s16(0),
+    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+  /* Pass 1: process rows.  NOTE(review): presumably needs 16-byte-aligned data -- confirm. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);  /* macro from jsimd_altivec.h; presumably fills col0..col7 */
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT();
+
+  /* Pass 2: process columns (same butterflies, applied to the transposed pass-1 output). */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT();
+
+  *(__vector short *)&data[0] = out0;  /* results overwrite the input block */
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
new file mode 100644
index 00000000..d1230c7c
--- /dev/null
+++ b/simd/jfdctint-altivec.c
@@ -0,0 +1,225 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+/* Shared pass tail: builds out1-3 and out5-7 from tmp4-7/tmp12/tmp13; PASS picks the *_p1 or *_p2 constants. */
+#define DO_FDCT_COMMON(PASS)  \
+{  \
+  tmp1312l = vec_mergeh(tmp13, tmp12);  /* interleave so vec_msums sees (tmp13, tmp12) pairs */  \
+  tmp1312h = vec_mergel(tmp13, tmp12);  \
+  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS);  \
+  out2h = vec_sra(out2h, descale_p##PASS);  \
+  out6l = vec_sra(out6l, descale_p##PASS);  \
+  out6h = vec_sra(out6h, descale_p##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(tmp4, tmp6);  \
+  z4 = vec_add(tmp5, tmp7);  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7);  \
+  tmp47h = vec_mergel(tmp4, tmp7);  \
+  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  /* z3l/z3h/z4l/z4h serve as addends */  \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS);  \
+  out7h = vec_sra(out7h, descale_p##PASS);  \
+  out1l = vec_sra(out1l, descale_p##PASS);  \
+  out1h = vec_sra(out1h, descale_p##PASS);  \
+  \
+  out7 = vec_pack(out7l, out7h);  \
+  out1 = vec_pack(out1l, out1h);  \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6);  \
+  tmp56h = vec_mergel(tmp5, tmp6);  \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS);  \
+  out5h = vec_sra(out5h, descale_p##PASS);  \
+  out3l = vec_sra(out3l, descale_p##PASS);  \
+  out3h = vec_sra(out3h, descale_p##PASS);  \
+  \
+  out5 = vec_pack(out5l, out5h);  \
+  out3 = vec_pack(out3l, out3h);  \
+}
+
+#define DO_FDCT_ROWS()  \
+{  \
+  /* Even part: in this pass out0/out4 are shifted left by pass1_bits */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_sl(out0, pass1_bits);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_sl(out4, pass1_bits);  \
+  \
+  DO_FDCT_COMMON(1);  /* remaining outputs, using the *_p1 descale constants */  \
+}
+
+#define DO_FDCT_COLS()  \
+{  \
+  /* Even part: out0/out4 get pw_descale_p2x added, then shift right by pass1_bits */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_add(out0, pw_descale_p2x);  \
+  out0  = vec_sra(out0, pass1_bits);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_add(out4, pw_descale_p2x);  \
+  out4  = vec_sra(out4, pass1_bits);  \
+  \
+  DO_FDCT_COMMON(2);  /* remaining outputs, using the *_p2 descale constants */  \
+}
+
+/* Slow integer forward DCT computed in place: data[] is read as eight vectors and overwritten. */
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants (pw_* hold coefficient pairs for vec_msums; pd_*/descale_* are per-pass rounding and shift) */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows.  NOTE(review): presumably needs 16-byte-aligned data -- confirm. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);  /* macro from jsimd_altivec.h; presumably fills col0..col7 */
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_ROWS();
+
+  /* Pass 2: process columns (operates on the transposed pass-1 output). */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_COLS();
+
+  *(__vector short *)&data[0] = out0;  /* results overwrite the input block */
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
new file mode 100644
index 00000000..f7749b84
--- /dev/null
+++ b/simd/jidctfst-altivec.c
@@ -0,0 +1,226 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082 277              /* FIX(1.082392200) */
+#define F_1_414 362              /* FIX(1.414213562) */
+#define F_1_847 473              /* FIX(1.847759065) */
+#define F_2_613 669              /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+/* One 1-D IDCT pass over in##0..in##7 (names pasted from `in`); results land in out0..out7. */
+#define DO_IDCT(in)  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(in##0, in##4);  \
+  tmp11 = vec_sub(in##0, in##4);  \
+  tmp13 = vec_add(in##2, in##6);  \
+  \
+  tmp12 = vec_sub(in##2, in##6);  \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  /* pre-scale for vec_madds */  \
+  tmp12 = vec_madds(tmp12, pw_F1414, zero);  \
+  tmp12 = vec_sub(tmp12, tmp13);  \
+  \
+  tmp0 = vec_add(tmp10, tmp13);  \
+  tmp3 = vec_sub(tmp10, tmp13);  \
+  tmp1 = vec_add(tmp11, tmp12);  \
+  tmp2 = vec_sub(tmp11, tmp12);  \
+  \
+  /* Odd part */  \
+  \
+  z13 = vec_add(in##5, in##3);  \
+  z10 = vec_sub(in##5, in##3);  \
+  z10s = vec_sl(z10, pre_multiply_scale_bits);  /* z10 itself is still needed below */  \
+  z11 = vec_add(in##1, in##7);  \
+  z12s = vec_sub(in##1, in##7);  \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits);  \
+  \
+  tmp11 = vec_sub(z11, z13);  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
+  tmp11 = vec_madds(tmp11, pw_F1414, zero);  \
+  \
+  tmp7 = vec_add(z11, z13);  \
+  \
+  z5 = vec_add(z10s, z12s);  \
+  z5 = vec_madds(z5, pw_F1847, zero);  \
+  \
+  tmp10 = vec_madds(z12s, pw_F1082, zero);  \
+  tmp10 = vec_sub(tmp10, z5);  \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5);  \
+  tmp12 = vec_sub(tmp12, z10);  /* extra -z10 completes the 2.613 factor (F_1_613 = F_2_613 - 256) */  \
+  \
+  tmp6 = vec_sub(tmp12, tmp7);  \
+  tmp5 = vec_sub(tmp11, tmp6);  \
+  tmp4 = vec_add(tmp10, tmp5);  \
+  \
+  out0 = vec_add(tmp0, tmp7);  \
+  out1 = vec_add(tmp1, tmp6);  \
+  out2 = vec_add(tmp2, tmp5);  \
+  out3 = vec_sub(tmp3, tmp4);  \
+  out4 = vec_add(tmp3, tmp4);  \
+  out5 = vec_sub(tmp2, tmp5);  \
+  out6 = vec_sub(tmp1, tmp6);  \
+  out7 = vec_sub(tmp0, tmp7);  \
+}
+
+/* Fast integer inverse DCT: scales coef_block by dct_table_ and stores one long long per row to output_buf[0..7]. */
+void
+jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector signed char outb;
+  long long *outptr, *outbptr = (long long *)(&outb);  /* views outb as long longs */
+
+  /* Constants (multipliers are pre-shifted by CONST_SHIFT; see the file header note) */
+  __vector short zero = { __8X(0) },
+    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+    pw_MF1613 = { __8X(-(F_1_613 << CONST_SHIFT)) },  /* negate after shifting: no negative left shift */
+    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+    pass1_bits3 = { __8X(PASS1_BITS + 3) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns.  NOTE(review): vector loads presumably need 16-byte alignment -- confirm. */
+
+  col0 = *(__vector short *)&coef_block[0];
+  col1 = *(__vector short *)&coef_block[8];
+  col2 = *(__vector short *)&coef_block[16];
+  col3 = *(__vector short *)&coef_block[24];
+  col4 = *(__vector short *)&coef_block[32];
+  col5 = *(__vector short *)&coef_block[40];
+  col6 = *(__vector short *)&coef_block[48];
+  col7 = *(__vector short *)&coef_block[56];
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);  /* tmp1 = col1 | col2 | ... | col7 */
+
+  quant0 = *(__vector short *)&dct_table[0];
+  col0 = vec_mladd(col0, quant0, zero);  /* col0 * quant0 (+ zero) */
+
+  if (vec_all_eq(tmp1, zero)) {
+    /* AC terms all zero: skip the column pass and splat each col0 element */
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = *(__vector short *)&dct_table[8];
+    quant2 = *(__vector short *)&dct_table[16];
+    quant3 = *(__vector short *)&dct_table[24];
+    quant4 = *(__vector short *)&dct_table[32];
+    quant5 = *(__vector short *)&dct_table[40];
+    quant6 = *(__vector short *)&dct_table[48];
+    quant7 = *(__vector short *)&dct_table[56];
+
+    col1 = vec_mladd(col1, quant1, zero);
+    col2 = vec_mladd(col2, quant2, zero);
+    col3 = vec_mladd(col3, quant3, zero);
+    col4 = vec_mladd(col4, quant4, zero);
+    col5 = vec_mladd(col5, quant5, zero);
+    col6 = vec_mladd(col6, quant6, zero);
+    col7 = vec_mladd(col7, quant7, zero);
+
+    DO_IDCT(col);
+
+    TRANSPOSE(out, row);  /* macro from jsimd_altivec.h; presumably fills row0..row7 */
+  }
+
+  /* Pass 2: process rows. */
+
+  DO_IDCT(row);
+
+  out0 = vec_sra(out0, pass1_bits3);
+  out1 = vec_sra(out1, pass1_bits3);
+  out2 = vec_sra(out2, pass1_bits3);
+  out3 = vec_sra(out3, pass1_bits3);
+  out4 = vec_sra(out4, pass1_bits3);
+  out5 = vec_sra(out5, pass1_bits3);
+  out6 = vec_sra(out6, pass1_bits3);
+  out7 = vec_sra(out7, pass1_bits3);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col1);  /* saturating pack of two short vectors into one char vector */
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[0] + output_col);
+  *outptr = outbptr[0];  /* first long long of outb -> output row 0 */
+  outptr = (long long *)(output_buf[1] + output_col);
+  *outptr = outbptr[1];  /* second long long of outb -> output row 1 */
+
+  outb = vec_packs(col2, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[2] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[3] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col4, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[4] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[5] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col6, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[6] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[7] + output_col);
+  *outptr = outbptr[1];
+}
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
new file mode 100644
index 00000000..00c6c58d
--- /dev/null
+++ b/simd/jidctint-altivec.c
@@ -0,0 +1,299 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define CONST_BITS 13  /* FIX(x) above is x * 2^13, rounded to an integer */
+#define PASS1_BITS 2   /* pass 1 output is left scaled up by 2^2 */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)      /* right shift in pass 1 */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)  /* right shift in pass 2 */
+
+
+#define DO_IDCT(in, PASS)  \
+{  /* 1-D IDCT: in##0..in##7 -> out0..out7; PASS selects rounding and shift */  \
+  /* Even part: built from in##0, in##2, in##4, in##6 */  \
+  \
+  in##26l = vec_mergeh(in##2, in##6);  \
+  in##26h = vec_mergel(in##2, in##6);  \
+  /* tmp3 = 1.306 * in2 + 0.541 * in6, tmp2 = 0.541 * in2 - 1.306 * in6 (x 2^13) */  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, zero32);  \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, zero32);  \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, zero32);  \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, zero32);  \
+  /* in0 + in4 and in0 - in4: widen to 32 bits, shift left, add rounding term */  \
+  tmp0 = vec_add(in##0, in##4);  \
+  tmp1 = vec_sub(in##0, in##4);  \
+  \
+  tmp0l = vec_unpackh(tmp0);  \
+  tmp0h = vec_unpackl(tmp0);  \
+  tmp0l = vec_sl(tmp0l, const_bits);  \
+  tmp0h = vec_sl(tmp0h, const_bits);  \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l);  \
+  tmp10h = vec_add(tmp0h, tmp3h);  \
+  tmp13l = vec_sub(tmp0l, tmp3l);  \
+  tmp13h = vec_sub(tmp0h, tmp3h);  \
+  \
+  tmp1l = vec_unpackh(tmp1);  \
+  tmp1h = vec_unpackl(tmp1);  \
+  tmp1l = vec_sl(tmp1l, const_bits);  \
+  tmp1h = vec_sl(tmp1h, const_bits);  \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l);  \
+  tmp11h = vec_add(tmp1h, tmp2h);  \
+  tmp12l = vec_sub(tmp1l, tmp2l);  \
+  tmp12h = vec_sub(tmp1h, tmp2h);  \
+  \
+  /* Odd part: built from in##1, in##3, in##5, in##7; reuses tmp0..tmp3 */  \
+  \
+  z3 = vec_add(in##3, in##7);  \
+  z4 = vec_add(in##1, in##5);  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  /* z3l/h = 1.175 * (z3 + z4) - 1.961 * z3; z4l/h = 1.175 * (z3 + z4) - 0.390 * z4 */  \
+  z3l = vec_msums(z34l, pw_mf078_f117, zero32);  \
+  z3h = vec_msums(z34h, pw_mf078_f117, zero32);  \
+  z4l = vec_msums(z34l, pw_f117_f078, zero32);  \
+  z4h = vec_msums(z34h, pw_f117_f078, zero32);  \
+  \
+  in##71l = vec_mergeh(in##7, in##1);  \
+  in##71h = vec_mergel(in##7, in##1);  \
+  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
+  \
+  in##53l = vec_mergeh(in##5, in##3);  \
+  in##53h = vec_mergel(in##5, in##3);  \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
+  \
+  /* Final output stage: even +/- odd, shift right, narrow back to 16 bits */  \
+  \
+  out0l = vec_add(tmp10l, tmp3l);  \
+  out0h = vec_add(tmp10h, tmp3h);  \
+  out7l = vec_sub(tmp10l, tmp3l);  \
+  out7h = vec_sub(tmp10h, tmp3h);  \
+  \
+  out0l = vec_sra(out0l, descale_p##PASS);  \
+  out0h = vec_sra(out0h, descale_p##PASS);  \
+  out7l = vec_sra(out7l, descale_p##PASS);  \
+  out7h = vec_sra(out7h, descale_p##PASS);  \
+  \
+  out0 = vec_pack(out0l, out0h);  \
+  out7 = vec_pack(out7l, out7h);  \
+  \
+  out1l = vec_add(tmp11l, tmp2l);  \
+  out1h = vec_add(tmp11h, tmp2h);  \
+  out6l = vec_sub(tmp11l, tmp2l);  \
+  out6h = vec_sub(tmp11h, tmp2h);  \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS);  \
+  out1h = vec_sra(out1h, descale_p##PASS);  \
+  out6l = vec_sra(out6l, descale_p##PASS);  \
+  out6h = vec_sra(out6h, descale_p##PASS);  \
+  \
+  out1 = vec_pack(out1l, out1h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  out2l = vec_add(tmp12l, tmp1l);  \
+  out2h = vec_add(tmp12h, tmp1h);  \
+  out5l = vec_sub(tmp12l, tmp1l);  \
+  out5h = vec_sub(tmp12h, tmp1h);  \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS);  \
+  out2h = vec_sra(out2h, descale_p##PASS);  \
+  out5l = vec_sra(out5l, descale_p##PASS);  \
+  out5h = vec_sra(out5h, descale_p##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out5 = vec_pack(out5l, out5h);  \
+  \
+  out3l = vec_add(tmp13l, tmp0l);  \
+  out3h = vec_add(tmp13h, tmp0h);  \
+  out4l = vec_sub(tmp13l, tmp0l);  \
+  out4h = vec_sub(tmp13h, tmp0h);  \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS);  \
+  out3h = vec_sra(out3h, descale_p##PASS);  \
+  out4l = vec_sra(out4l, descale_p##PASS);  \
+  out4h = vec_sra(out4h, descale_p##PASS);  \
+  \
+  out3 = vec_pack(out3l, out3h);  \
+  out4 = vec_pack(out4l, out4h);  \
+}
+
+
+void
+jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{  /* Scale coef_block by dct_table, run both IDCT passes, store 8 rows of 8 bytes. */
+  short *dct_table = (short *)dct_table_;
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+  long long *outptr, *outbptr = (long long *)(&outb);
+
+  /* Constants: each pw_* vector repeats one pair of multipliers four times. */
+  __vector short zero16 = { __8X(0) },
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int zero32 = { __4X(0) },
+    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) },
+    const_bits = { __4X(CONST_BITS) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns.  colN holds coef_block[8 * N] .. [8 * N + 7]. */
+  /* NOTE(review): the vector loads presumably need 16-byte-aligned inputs -- confirm. */
+  col0 = *(__vector short *)&coef_block[0];
+  col1 = *(__vector short *)&coef_block[8];
+  col2 = *(__vector short *)&coef_block[16];
+  col3 = *(__vector short *)&coef_block[24];
+  col4 = *(__vector short *)&coef_block[32];
+  col5 = *(__vector short *)&coef_block[40];
+  col6 = *(__vector short *)&coef_block[48];
+  col7 = *(__vector short *)&coef_block[56];
+  /* OR col1..col7 together: tmp1 is all zero only if every one of them is. */
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = *(__vector short *)&dct_table[0];
+  col0 = vec_mladd(col0, quant0, zero16);
+
+  if (vec_all_eq(tmp1, zero16)) {
+    /* AC terms all zero: use col0 << PASS1_BITS as every pass 1 output row, */
+    /* so rowN (the transposed result) is element N of that vector, replicated. */
+    col0 = vec_sl(col0, pass1_bits);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+    /* Multiply the remaining coefficients by their dct_table entries. */
+    quant1 = *(__vector short *)&dct_table[8];
+    quant2 = *(__vector short *)&dct_table[16];
+    quant3 = *(__vector short *)&dct_table[24];
+    quant4 = *(__vector short *)&dct_table[32];
+    quant5 = *(__vector short *)&dct_table[40];
+    quant6 = *(__vector short *)&dct_table[48];
+    quant7 = *(__vector short *)&dct_table[56];
+
+    col1 = vec_mladd(col1, quant1, zero16);
+    col2 = vec_mladd(col2, quant2, zero16);
+    col3 = vec_mladd(col3, quant3, zero16);
+    col4 = vec_mladd(col4, quant4, zero16);
+    col5 = vec_mladd(col5, quant5, zero16);
+    col6 = vec_mladd(col6, quant6, zero16);
+    col7 = vec_mladd(col7, quant7, zero16);
+
+    DO_IDCT(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+  /* Pass 2: process rows. */
+  DO_IDCT(row, 2);
+
+  TRANSPOSE(out, col);
+  /* Pack two vectors into bytes, add pb_centerjsamp, copy one 8-byte half per row. */
+  outb = vec_packs(col0, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[0] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[1] + output_col);
+  *outptr = outbptr[1];
+  /* NOTE(review): stores go through long long *; confirm row alignment/aliasing is OK. */
+  outb = vec_packs(col2, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[2] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[3] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col4, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[4] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[5] + output_col);
+  *outptr = outbptr[1];
+
+  outb = vec_packs(col6, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (long long *)(output_buf[6] + output_col);
+  *outptr = outbptr[0];
+  outptr = (long long *)(output_buf[7] + output_col);
+  *outptr = outbptr[1];
+}
diff --git a/simd/jsimd_altivec.h b/simd/jsimd_altivec.h
new file mode 100644
index 00000000..78705568
--- /dev/null
+++ b/simd/jsimd_altivec.h
@@ -0,0 +1,76 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+#define __4X(a) a, a, a, a                  /* a, repeated 4 times */
+#define __4X2(a, b) a, b, a, b, a, b, a, b  /* the pair a, b, repeated 4 times */
+#define __8X(a) __4X(a), __4X(a)            /* a, repeated 8 times */
+#define __16X(a) __8X(a), __8X(a)           /* a, repeated 16 times */
+
+#define TRANSPOSE(row, col)  \
+{  /* 8x8 transpose of 16-bit elements: reads row##0..row##7, writes col##0..col##7 */  \
+  __vector short row04l, row04h, row15l, row15h,  \
+                 row26l, row26h, row37l, row37h;  \
+  __vector short col01e, col01o, col23e, col23o,  \
+                 col45e, col45o, col67e, col67o;  \
+  /* In the comments below, "rc" denotes element c of input vector row##r. */  \
+                                       /* transpose coefficients (phase 1) */ \
+  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+  \
+                                       /* transpose coefficients (phase 2) */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+  \
+                                       /* transpose coefficients (phase 3) */ \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
+}
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
deleted file mode 100644
index 7796b707..00000000
--- a/simd/jsimd_powerpc_altivec.c
+++ /dev/null
@@ -1,862 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander.
- * All rights reserved.
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-#include <altivec.h>
-
-
-/* Common code */
-
-#define __4X(a) a, a, a, a
-#define __4X2(a, b) a, b, a, b, a, b, a, b
-#define __8X(a) __4X(a), __4X(a)
-#define __16X(a) __8X(a), __8X(a)
-
-#define TRANSPOSE(row, col)  \
-{  \
-  __vector short row04l, row04h, row15l, row15h,  \
-                 row26l, row26h, row37l, row37h;  \
-  __vector short col01e, col01o, col23e, col23o,  \
-                 col45e, col45o, col67e, col67o;  \
-  \
-                                       /* transpose coefficients (phase 1) */ \
-  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
-  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
-  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
-  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
-  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
-  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
-  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
-  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
-  \
-                                       /* transpose coefficients (phase 2) */ \
-  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
-  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
-  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
-  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
-  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
-  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
-  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
-  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
-  \
-                                       /* transpose coefficients (phase 3) */ \
-  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
-  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
-  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
-  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
-  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
-  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
-  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
-  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
-}
-
-
-/* SLOW INTEGER FORWARD DCT */
-
-#define ISLOW_F_0_298 2446   /* FIX(0.298631336) */
-#define ISLOW_F_0_390 3196   /* FIX(0.390180644) */
-#define ISLOW_F_0_541 4433   /* FIX(0.541196100) */
-#define ISLOW_F_0_765 6270   /* FIX(0.765366865) */
-#define ISLOW_F_0_899 7373   /* FIX(0.899976223) */
-#define ISLOW_F_1_175 9633   /* FIX(1.175875602) */
-#define ISLOW_F_1_501 12299  /* FIX(1.501321110) */
-#define ISLOW_F_1_847 15137  /* FIX(1.847759065) */
-#define ISLOW_F_1_961 16069  /* FIX(1.961570560) */
-#define ISLOW_F_2_053 16819  /* FIX(2.053119869) */
-#define ISLOW_F_2_562 20995  /* FIX(2.562915447) */
-#define ISLOW_F_3_072 25172  /* FIX(3.072711026) */
-
-#define ISLOW_CONST_BITS 13
-#define ISLOW_PASS1_BITS 2
-#define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
-#define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
-
-#define DO_FDCT_ISLOW_COMMON(PASS)  \
-{  \
-  tmp1312l = vec_mergeh(tmp13, tmp12);  \
-  tmp1312h = vec_mergel(tmp13, tmp12);  \
-  \
-  out2l = vec_msums(tmp1312l, PW_F130_F054, PD_DESCALE_P##PASS);  \
-  out2h = vec_msums(tmp1312h, PW_F130_F054, PD_DESCALE_P##PASS);  \
-  out6l = vec_msums(tmp1312l, PW_F054_MF130, PD_DESCALE_P##PASS);  \
-  out6h = vec_msums(tmp1312h, PW_F054_MF130, PD_DESCALE_P##PASS);  \
-  \
-  out2l = vec_sra(out2l, DESCALE_P##PASS);  \
-  out2h = vec_sra(out2h, DESCALE_P##PASS);  \
-  out6l = vec_sra(out6l, DESCALE_P##PASS);  \
-  out6h = vec_sra(out6h, DESCALE_P##PASS);  \
-  \
-  out2 = vec_pack(out2l, out2h);  \
-  out6 = vec_pack(out6l, out6h);  \
-  \
-  /* Odd part */  \
-  \
-  z3 = vec_add(tmp4, tmp6);  \
-  z4 = vec_add(tmp5, tmp7);  \
-  \
-  z34l = vec_mergeh(z3, z4);  \
-  z34h = vec_mergel(z3, z4);  \
-  \
-  z3l = vec_msums(z34l, PW_MF078_F117, PD_DESCALE_P##PASS);  \
-  z3h = vec_msums(z34h, PW_MF078_F117, PD_DESCALE_P##PASS);  \
-  z4l = vec_msums(z34l, PW_F117_F078, PD_DESCALE_P##PASS);  \
-  z4h = vec_msums(z34h, PW_F117_F078, PD_DESCALE_P##PASS);  \
-  \
-  tmp47l = vec_mergeh(tmp4, tmp7);  \
-  tmp47h = vec_mergel(tmp4, tmp7);  \
-  \
-  out7l = vec_msums(tmp47l, PW_MF060_MF089, z3l);  \
-  out7h = vec_msums(tmp47h, PW_MF060_MF089, z3h);  \
-  out1l = vec_msums(tmp47l, PW_MF089_F060, z4l);  \
-  out1h = vec_msums(tmp47h, PW_MF089_F060, z4h);  \
-  \
-  out7l = vec_sra(out7l, DESCALE_P##PASS);  \
-  out7h = vec_sra(out7h, DESCALE_P##PASS);  \
-  out1l = vec_sra(out1l, DESCALE_P##PASS);  \
-  out1h = vec_sra(out1h, DESCALE_P##PASS);  \
-  \
-  out7 = vec_pack(out7l, out7h);  \
-  out1 = vec_pack(out1l, out1h);  \
-  \
-  tmp56l = vec_mergeh(tmp5, tmp6);  \
-  tmp56h = vec_mergel(tmp5, tmp6);  \
-  \
-  out5l = vec_msums(tmp56l, PW_MF050_MF256, z4l);  \
-  out5h = vec_msums(tmp56h, PW_MF050_MF256, z4h);  \
-  out3l = vec_msums(tmp56l, PW_MF256_F050, z3l);  \
-  out3h = vec_msums(tmp56h, PW_MF256_F050, z3h);  \
-  \
-  out5l = vec_sra(out5l, DESCALE_P##PASS);  \
-  out5h = vec_sra(out5h, DESCALE_P##PASS);  \
-  out3l = vec_sra(out3l, DESCALE_P##PASS);  \
-  out3h = vec_sra(out3h, DESCALE_P##PASS);  \
-  \
-  out5 = vec_pack(out5l, out5h);  \
-  out3 = vec_pack(out3l, out3h);  \
-}
-
-#define DO_FDCT_ISLOW_ROWS()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out0  = vec_sl(out0, PASS1_BITS);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  out4  = vec_sl(out4, PASS1_BITS);  \
-  \
-  DO_FDCT_ISLOW_COMMON(1);  \
-}
-
-#define DO_FDCT_ISLOW_COLS()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out0  = vec_add(out0, PW_DESCALE_P2X);  \
-  out0  = vec_sra(out0, PASS1_BITS);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  out4  = vec_add(out4, PW_DESCALE_P2X);  \
-  out4  = vec_sra(out4, PASS1_BITS);  \
-  \
-  DO_FDCT_ISLOW_COMMON(2);  \
-}
-
-void
-jsimd_fdct_islow_altivec (DCTELEM *data)
-{
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
-    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
-    z3, z4, z34l, z34h,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector int z3l, z3h, z4l, z4h,
-    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
-    out7l, out7h;
-
-  /* Constants */
-  __vector short
-    PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
-    PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
-    PW_MF078_F117 = { __4X2(ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175) },
-    PW_F117_F078 = { __4X2(ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390) },
-    PW_MF060_MF089 = { __4X2(ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899) },
-    PW_MF089_F060 = { __4X2(-ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899) },
-    PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
-    PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) },
-    PW_DESCALE_P2X = { __8X(1 << (ISLOW_PASS1_BITS - 1)) };
-  __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
-  __vector int PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
-    PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 - 1)) };
-  __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
-    DESCALE_P2 = { __4X(ISLOW_DESCALE_P2) };
-
-  /* Pass 1: process rows. */
-
-  row0 = *(__vector short *)&data[0];
-  row1 = *(__vector short *)&data[8];
-  row2 = *(__vector short *)&data[16];
-  row3 = *(__vector short *)&data[24];
-  row4 = *(__vector short *)&data[32];
-  row5 = *(__vector short *)&data[40];
-  row6 = *(__vector short *)&data[48];
-  row7 = *(__vector short *)&data[56];
-
-  TRANSPOSE(row, col);
-
-  tmp0 = vec_add(col0, col7);
-  tmp7 = vec_sub(col0, col7);
-  tmp1 = vec_add(col1, col6);
-  tmp6 = vec_sub(col1, col6);
-  tmp2 = vec_add(col2, col5);
-  tmp5 = vec_sub(col2, col5);
-  tmp3 = vec_add(col3, col4);
-  tmp4 = vec_sub(col3, col4);
-
-  DO_FDCT_ISLOW_ROWS();
-
-  /* Pass 2: process columns. */
-
-  TRANSPOSE(out, row);
-
-  tmp0 = vec_add(row0, row7);
-  tmp7 = vec_sub(row0, row7);
-  tmp1 = vec_add(row1, row6);
-  tmp6 = vec_sub(row1, row6);
-  tmp2 = vec_add(row2, row5);
-  tmp5 = vec_sub(row2, row5);
-  tmp3 = vec_add(row3, row4);
-  tmp4 = vec_sub(row3, row4);
-
-  DO_FDCT_ISLOW_COLS();
-
-  *(__vector short *)&data[0] = out0;
-  *(__vector short *)&data[8] = out1;
-  *(__vector short *)&data[16] = out2;
-  *(__vector short *)&data[24] = out3;
-  *(__vector short *)&data[32] = out4;
-  *(__vector short *)&data[40] = out5;
-  *(__vector short *)&data[48] = out6;
-  *(__vector short *)&data[56] = out7;
-}
-
-
-/* FAST INTEGER FORWARD DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- *   the elements in arg3 + the most significant 17 bits of
- *     (the elements in arg1 * the elements in arg2).
- */
-
-#define IFAST_F_0_382 98   /* FIX(0.382683433) */
-#define IFAST_F_0_541 139  /* FIX(0.541196100) */
-#define IFAST_F_0_707 181  /* FIX(0.707106781) */
-#define IFAST_F_1_306 334  /* FIX(1.306562965) */
-
-#define IFAST_CONST_BITS 8
-#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
-#define IFAST_CONST_SHIFT \
-  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
-
-#define DO_FDCT_IFAST()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  \
-  z1 = vec_add(tmp12, tmp13);  \
-  z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS);  \
-  z1 = vec_madds(z1, PW_0707, zero);  \
-  \
-  out2 = vec_add(tmp13, z1);  \
-  out6 = vec_sub(tmp13, z1);  \
-  \
-  /* Odd part */  \
-  \
-  tmp10 = vec_add(tmp4, tmp5);  \
-  tmp11 = vec_add(tmp5, tmp6);  \
-  tmp12 = vec_add(tmp6, tmp7);  \
-  \
-  tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS);  \
-  tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS);  \
-  z5 = vec_sub(tmp10, tmp12);  \
-  z5 = vec_madds(z5, PW_0382, zero);  \
-  \
-  z2 = vec_madds(tmp10, PW_0541, z5);  \
-  z4 = vec_madds(tmp12, PW_1306, z5);  \
-  \
-  tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
-  z3 = vec_madds(tmp11, PW_0707, zero);  \
-  \
-  z11 = vec_add(tmp7, z3);  \
-  z13 = vec_sub(tmp7, z3);  \
-  \
-  out5 = vec_add(z13, z2);  \
-  out3 = vec_sub(z13, z2);  \
-  out1 = vec_add(z11, z4);  \
-  out7 = vec_sub(z11, z4);  \
-}
-
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
-{
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
-    z1, z2, z3, z4, z5, z11, z13,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-
-  /* Constants */
-  __vector short zero = vec_splat_s16(0),
-    PW_0382 = { __8X(IFAST_F_0_382 << IFAST_CONST_SHIFT) },
-    PW_0541 = { __8X(IFAST_F_0_541 << IFAST_CONST_SHIFT) },
-    PW_0707 = { __8X(IFAST_F_0_707 << IFAST_CONST_SHIFT) },
-    PW_1306 = { __8X(IFAST_F_1_306 << IFAST_CONST_SHIFT) };
-  __vector unsigned short
-    PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) };
-
-  /* Pass 1: process rows. */
-
-  row0 = *(__vector short *)&data[0];
-  row1 = *(__vector short *)&data[8];
-  row2 = *(__vector short *)&data[16];
-  row3 = *(__vector short *)&data[24];
-  row4 = *(__vector short *)&data[32];
-  row5 = *(__vector short *)&data[40];
-  row6 = *(__vector short *)&data[48];
-  row7 = *(__vector short *)&data[56];
-
-  TRANSPOSE(row, col);
-
-  tmp0 = vec_add(col0, col7);
-  tmp7 = vec_sub(col0, col7);
-  tmp1 = vec_add(col1, col6);
-  tmp6 = vec_sub(col1, col6);
-  tmp2 = vec_add(col2, col5);
-  tmp5 = vec_sub(col2, col5);
-  tmp3 = vec_add(col3, col4);
-  tmp4 = vec_sub(col3, col4);
-
-  DO_FDCT_IFAST();
-
-  /* Pass 2: process columns. */
-
-  TRANSPOSE(out, row);
-
-  tmp0 = vec_add(row0, row7);
-  tmp7 = vec_sub(row0, row7);
-  tmp1 = vec_add(row1, row6);
-  tmp6 = vec_sub(row1, row6);
-  tmp2 = vec_add(row2, row5);
-  tmp5 = vec_sub(row2, row5);
-  tmp3 = vec_add(row3, row4);
-  tmp4 = vec_sub(row3, row4);
-
-  DO_FDCT_IFAST();
-
-  *(__vector short *)&data[0] = out0;
-  *(__vector short *)&data[8] = out1;
-  *(__vector short *)&data[16] = out2;
-  *(__vector short *)&data[24] = out3;
-  *(__vector short *)&data[32] = out4;
-  *(__vector short *)&data[40] = out5;
-  *(__vector short *)&data[48] = out6;
-  *(__vector short *)&data[56] = out7;
-}
-
-
-/* SLOW INTEGER INVERSE DCT */
-
-#define DO_IDCT_ISLOW(in, PASS)  \
-{  \
-  /* Even part */  \
-  \
-  in##26l = vec_mergeh(in##2, in##6);  \
-  in##26h = vec_mergel(in##2, in##6);  \
-  \
-  tmp3l = vec_msums(in##26l, PW_F130_F054, zero32);  \
-  tmp3h = vec_msums(in##26h, PW_F130_F054, zero32);  \
-  tmp2l = vec_msums(in##26l, PW_F054_MF130, zero32);  \
-  tmp2h = vec_msums(in##26h, PW_F054_MF130, zero32);  \
-  \
-  tmp0 = vec_add(in##0, in##4);  \
-  tmp1 = vec_sub(in##0, in##4);  \
-  \
-  tmp0l = vec_unpackh(tmp0);  \
-  tmp0h = vec_unpackl(tmp0);  \
-  tmp0l = vec_sl(tmp0l, CONST_BITS);  \
-  tmp0h = vec_sl(tmp0h, CONST_BITS);  \
-  tmp0l = vec_add(tmp0l, PD_DESCALE_P##PASS);  \
-  tmp0h = vec_add(tmp0h, PD_DESCALE_P##PASS);  \
-  \
-  tmp10l = vec_add(tmp0l, tmp3l);  \
-  tmp10h = vec_add(tmp0h, tmp3h);  \
-  tmp13l = vec_sub(tmp0l, tmp3l);  \
-  tmp13h = vec_sub(tmp0h, tmp3h);  \
-  \
-  tmp1l = vec_unpackh(tmp1);  \
-  tmp1h = vec_unpackl(tmp1);  \
-  tmp1l = vec_sl(tmp1l, CONST_BITS);  \
-  tmp1h = vec_sl(tmp1h, CONST_BITS);  \
-  tmp1l = vec_add(tmp1l, PD_DESCALE_P##PASS);  \
-  tmp1h = vec_add(tmp1h, PD_DESCALE_P##PASS);  \
-  \
-  tmp11l = vec_add(tmp1l, tmp2l);  \
-  tmp11h = vec_add(tmp1h, tmp2h);  \
-  tmp12l = vec_sub(tmp1l, tmp2l);  \
-  tmp12h = vec_sub(tmp1h, tmp2h);  \
-  \
-  /* Odd part */  \
-  \
-  z3 = vec_add(in##3, in##7);  \
-  z4 = vec_add(in##1, in##5);  \
-  \
-  z34l = vec_mergeh(z3, z4);  \
-  z34h = vec_mergel(z3, z4);  \
-  \
-  z3l = vec_msums(z34l, PW_MF078_F117, zero32);  \
-  z3h = vec_msums(z34h, PW_MF078_F117, zero32);  \
-  z4l = vec_msums(z34l, PW_F117_F078, zero32);  \
-  z4h = vec_msums(z34h, PW_F117_F078, zero32);  \
-  \
-  in##71l = vec_mergeh(in##7, in##1);  \
-  in##71h = vec_mergel(in##7, in##1);  \
-  \
-  tmp0l = vec_msums(in##71l, PW_MF060_MF089, z3l);  \
-  tmp0h = vec_msums(in##71h, PW_MF060_MF089, z3h);  \
-  tmp3l = vec_msums(in##71l, PW_MF089_F060, z4l);  \
-  tmp3h = vec_msums(in##71h, PW_MF089_F060, z4h);  \
-  \
-  in##53l = vec_mergeh(in##5, in##3);  \
-  in##53h = vec_mergel(in##5, in##3);  \
-  \
-  tmp1l = vec_msums(in##53l, PW_MF050_MF256, z4l);  \
-  tmp1h = vec_msums(in##53h, PW_MF050_MF256, z4h);  \
-  tmp2l = vec_msums(in##53l, PW_MF256_F050, z3l);  \
-  tmp2h = vec_msums(in##53h, PW_MF256_F050, z3h);  \
-  \
-  /* Final output stage */  \
-  \
-  out0l = vec_add(tmp10l, tmp3l);  \
-  out0h = vec_add(tmp10h, tmp3h);  \
-  out7l = vec_sub(tmp10l, tmp3l);  \
-  out7h = vec_sub(tmp10h, tmp3h);  \
-  \
-  out0l = vec_sra(out0l, DESCALE_P##PASS);  \
-  out0h = vec_sra(out0h, DESCALE_P##PASS);  \
-  out7l = vec_sra(out7l, DESCALE_P##PASS);  \
-  out7h = vec_sra(out7h, DESCALE_P##PASS);  \
-  \
-  out0 = vec_pack(out0l, out0h);  \
-  out7 = vec_pack(out7l, out7h);  \
-  \
-  out1l = vec_add(tmp11l, tmp2l);  \
-  out1h = vec_add(tmp11h, tmp2h);  \
-  out6l = vec_sub(tmp11l, tmp2l);  \
-  out6h = vec_sub(tmp11h, tmp2h);  \
-  \
-  out1l = vec_sra(out1l, DESCALE_P##PASS);  \
-  out1h = vec_sra(out1h, DESCALE_P##PASS);  \
-  out6l = vec_sra(out6l, DESCALE_P##PASS);  \
-  out6h = vec_sra(out6h, DESCALE_P##PASS);  \
-  \
-  out1 = vec_pack(out1l, out1h);  \
-  out6 = vec_pack(out6l, out6h);  \
-  \
-  out2l = vec_add(tmp12l, tmp1l);  \
-  out2h = vec_add(tmp12h, tmp1h);  \
-  out5l = vec_sub(tmp12l, tmp1l);  \
-  out5h = vec_sub(tmp12h, tmp1h);  \
-  \
-  out2l = vec_sra(out2l, DESCALE_P##PASS);  \
-  out2h = vec_sra(out2h, DESCALE_P##PASS);  \
-  out5l = vec_sra(out5l, DESCALE_P##PASS);  \
-  out5h = vec_sra(out5h, DESCALE_P##PASS);  \
-  \
-  out2 = vec_pack(out2l, out2h);  \
-  out5 = vec_pack(out5l, out5h);  \
-  \
-  out3l = vec_add(tmp13l, tmp0l);  \
-  out3h = vec_add(tmp13h, tmp0h);  \
-  out4l = vec_sub(tmp13l, tmp0l);  \
-  out4h = vec_sub(tmp13h, tmp0h);  \
-  \
-  out3l = vec_sra(out3l, DESCALE_P##PASS);  \
-  out3h = vec_sra(out3h, DESCALE_P##PASS);  \
-  out4l = vec_sra(out4l, DESCALE_P##PASS);  \
-  out4h = vec_sra(out4h, DESCALE_P##PASS);  \
-  \
-  out3 = vec_pack(out3l, out3h);  \
-  out4 = vec_pack(out4l, out4h);  \
-}
-
-void
-jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
-                          JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  short *dct_table = (short *)dct_table_;
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
-    tmp0, tmp1, tmp2, tmp3, z3, z4,
-    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
-    row71l, row71h, row26l, row26h, row53l, row53h,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
-    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
-    z3l, z3h, z4l, z4h,
-    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
-    out5l, out5h, out6l, out6h, out7l, out7h;
-  __vector signed char outb;
-  long long *outptr, *outbptr = (long long *)(&outb);
-
-  /* Constants */
-  __vector short zero16 = { __8X(0) },
-    PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
-    PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
-    PW_MF078_F117 = { __4X2(ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175) },
-    PW_F117_F078 = { __4X2(ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390) },
-    PW_MF060_MF089 = { __4X2(ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899) },
-    PW_MF089_F060 = { __4X2(-ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899) },
-    PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
-    PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) };
-  __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
-  __vector int zero32 = { __4X(0) },
-    PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
-    PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 + 3 - 1)) };
-  __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
-    DESCALE_P2 = { __4X(ISLOW_DESCALE_P2 + 3) },
-    CONST_BITS = { __4X(ISLOW_CONST_BITS) };
-  __vector signed char PB_CENTERJSAMP = { __16X(CENTERJSAMPLE) };
-
-  /* Pass 1: process columns. */
-
-  col0 = *(__vector short *)&coef_block[0];
-  col1 = *(__vector short *)&coef_block[8];
-  col2 = *(__vector short *)&coef_block[16];
-  col3 = *(__vector short *)&coef_block[24];
-  col4 = *(__vector short *)&coef_block[32];
-  col5 = *(__vector short *)&coef_block[40];
-  col6 = *(__vector short *)&coef_block[48];
-  col7 = *(__vector short *)&coef_block[56];
-
-  tmp1 = vec_or(col1, col2);
-  tmp2 = vec_or(col3, col4);
-  tmp1 = vec_or(tmp1, tmp2);
-  tmp3 = vec_or(col5, col6);
-  tmp3 = vec_or(tmp3, col7);
-  tmp1 = vec_or(tmp1, tmp3);
-
-  quant0 = *(__vector short *)&dct_table[0];
-  col0 = vec_mladd(col0, quant0, zero16);
-
-  if (vec_all_eq(tmp1, zero16)) {
-    /* AC terms all zero */
-
-    col0 = vec_sl(col0, PASS1_BITS);
-
-    row0 = vec_splat(col0, 0);
-    row1 = vec_splat(col0, 1);
-    row2 = vec_splat(col0, 2);
-    row3 = vec_splat(col0, 3);
-    row4 = vec_splat(col0, 4);
-    row5 = vec_splat(col0, 5);
-    row6 = vec_splat(col0, 6);
-    row7 = vec_splat(col0, 7);
-
-  } else {
-
-    quant1 = *(__vector short *)&dct_table[8];
-    quant2 = *(__vector short *)&dct_table[16];
-    quant3 = *(__vector short *)&dct_table[24];
-    quant4 = *(__vector short *)&dct_table[32];
-    quant5 = *(__vector short *)&dct_table[40];
-    quant6 = *(__vector short *)&dct_table[48];
-    quant7 = *(__vector short *)&dct_table[56];
-
-    col1 = vec_mladd(col1, quant1, zero16);
-    col2 = vec_mladd(col2, quant2, zero16);
-    col3 = vec_mladd(col3, quant3, zero16);
-    col4 = vec_mladd(col4, quant4, zero16);
-    col5 = vec_mladd(col5, quant5, zero16);
-    col6 = vec_mladd(col6, quant6, zero16);
-    col7 = vec_mladd(col7, quant7, zero16);
-
-    DO_IDCT_ISLOW(col, 1);
-
-    TRANSPOSE(out, row);
-  }
-
-  DO_IDCT_ISLOW(row, 2);
-
-  TRANSPOSE(out, col);
-
-  outb = vec_packs(col0, col1);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[0] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[1] + output_col);
-  *outptr = outbptr[1];
-
-  outb = vec_packs(col2, col3);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[2] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[3] + output_col);
-  *outptr = outbptr[1];
-
-  outb = vec_packs(col4, col5);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[4] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[5] + output_col);
-  *outptr = outbptr[1];
-
-  outb = vec_packs(col6, col7);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[6] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[7] + output_col);
-  *outptr = outbptr[1];
-}
-
-
-/* FAST INTEGER INVERSE DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- *   the elements in arg3 + the most significant 17 bits of
- *     (the elements in arg1 * the elements in arg2).
- */
-
-#define IFAST_F_1_082 277                    /* FIX(1.082392200) */
-#define IFAST_F_1_414 362                    /* FIX(1.414213562) */
-#define IFAST_F_1_847 473                    /* FIX(1.847759065) */
-#define IFAST_F_2_613 669                    /* FIX(2.613125930) */
-#define IFAST_F_1_613 (IFAST_F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
-
-#define IFAST_PASS1_BITS 2
-
-#define DO_IDCT_IFAST(in)  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(in##0, in##4);  \
-  tmp11 = vec_sub(in##0, in##4);  \
-  tmp13 = vec_add(in##2, in##6);  \
-  \
-  tmp12 = vec_sub(in##2, in##6);  \
-  tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS);  \
-  tmp12 = vec_madds(tmp12, PW_F1414, zero);  \
-  tmp12 = vec_sub(tmp12, tmp13);  \
-  \
-  tmp0 = vec_add(tmp10, tmp13);  \
-  tmp3 = vec_sub(tmp10, tmp13);  \
-  tmp1 = vec_add(tmp11, tmp12);  \
-  tmp2 = vec_sub(tmp11, tmp12);  \
-  \
-  /* Odd part */  \
-  \
-  z13 = vec_add(in##5, in##3);  \
-  z10 = vec_sub(in##5, in##3);  \
-  z10s = vec_sl(z10, PRE_MULTIPLY_SCALE_BITS);  \
-  z11 = vec_add(in##1, in##7);  \
-  z12s = vec_sub(in##1, in##7);  \
-  z12s = vec_sl(z12s, PRE_MULTIPLY_SCALE_BITS);  \
-  \
-  tmp11 = vec_sub(z11, z13);  \
-  tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS);  \
-  tmp11 = vec_madds(tmp11, PW_F1414, zero);  \
-  \
-  tmp7 = vec_add(z11, z13);  \
-  \
-  z5 = vec_add(z10s, z12s);  \
-  z5 = vec_madds(z5, PW_F1847, zero);  \
-  \
-  tmp10 = vec_madds(z12s, PW_F1082, zero);  \
-  tmp10 = vec_sub(tmp10, z5);  \
-  tmp12 = vec_madds(z10s, PW_MF1613, z5);  \
-  tmp12 = vec_sub(tmp12, z10);  \
-  \
-  tmp6 = vec_sub(tmp12, tmp7);  \
-  tmp5 = vec_sub(tmp11, tmp6);  \
-  tmp4 = vec_add(tmp10, tmp5);  \
-  \
-  out0 = vec_add(tmp0, tmp7);  \
-  out1 = vec_add(tmp1, tmp6);  \
-  out2 = vec_add(tmp2, tmp5);  \
-  out3 = vec_sub(tmp3, tmp4);  \
-  out4 = vec_add(tmp3, tmp4);  \
-  out5 = vec_sub(tmp2, tmp5);  \
-  out6 = vec_sub(tmp1, tmp6);  \
-  out7 = vec_sub(tmp0, tmp7);  \
-}
-
-void
-jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
-                          JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  short *dct_table = (short *)dct_table_;
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
-    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
-    z5, z10, z10s, z11, z12s, z13,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector signed char outb;
-  long long *outptr, *outbptr = (long long *)(&outb);
-
-  /* Constants */
-  __vector short zero = { __8X(0) },
-    PW_F1414 = { __8X(IFAST_F_1_414 << IFAST_CONST_SHIFT) },
-    PW_F1847 = { __8X(IFAST_F_1_847 << IFAST_CONST_SHIFT) },
-    PW_MF1613 = { __8X(-IFAST_F_1_613 << IFAST_CONST_SHIFT) },
-    PW_F1082 = { __8X(IFAST_F_1_082 << IFAST_CONST_SHIFT) };
-  __vector unsigned short
-    PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) },
-    PASS1_BITS3 = { __8X(IFAST_PASS1_BITS + 3) };
-  __vector signed char PB_CENTERJSAMP = { __16X(CENTERJSAMPLE) };
-
-  /* Pass 1: process columns. */
-
-  col0 = *(__vector short *)&coef_block[0];
-  col1 = *(__vector short *)&coef_block[8];
-  col2 = *(__vector short *)&coef_block[16];
-  col3 = *(__vector short *)&coef_block[24];
-  col4 = *(__vector short *)&coef_block[32];
-  col5 = *(__vector short *)&coef_block[40];
-  col6 = *(__vector short *)&coef_block[48];
-  col7 = *(__vector short *)&coef_block[56];
-
-  tmp1 = vec_or(col1, col2);
-  tmp2 = vec_or(col3, col4);
-  tmp1 = vec_or(tmp1, tmp2);
-  tmp3 = vec_or(col5, col6);
-  tmp3 = vec_or(tmp3, col7);
-  tmp1 = vec_or(tmp1, tmp3);
-
-  quant0 = *(__vector short *)&dct_table[0];
-  col0 = vec_mladd(col0, quant0, zero);
-
-  if (vec_all_eq(tmp1, zero)) {
-    /* AC terms all zero */
-
-    row0 = vec_splat(col0, 0);
-    row1 = vec_splat(col0, 1);
-    row2 = vec_splat(col0, 2);
-    row3 = vec_splat(col0, 3);
-    row4 = vec_splat(col0, 4);
-    row5 = vec_splat(col0, 5);
-    row6 = vec_splat(col0, 6);
-    row7 = vec_splat(col0, 7);
-
-  } else {
-
-    quant1 = *(__vector short *)&dct_table[8];
-    quant2 = *(__vector short *)&dct_table[16];
-    quant3 = *(__vector short *)&dct_table[24];
-    quant4 = *(__vector short *)&dct_table[32];
-    quant5 = *(__vector short *)&dct_table[40];
-    quant6 = *(__vector short *)&dct_table[48];
-    quant7 = *(__vector short *)&dct_table[56];
-
-    col1 = vec_mladd(col1, quant1, zero);
-    col2 = vec_mladd(col2, quant2, zero);
-    col3 = vec_mladd(col3, quant3, zero);
-    col4 = vec_mladd(col4, quant4, zero);
-    col5 = vec_mladd(col5, quant5, zero);
-    col6 = vec_mladd(col6, quant6, zero);
-    col7 = vec_mladd(col7, quant7, zero);
-
-    DO_IDCT_IFAST(col);
-
-    TRANSPOSE(out, row);
-  }
-
-  /* Pass 2: process rows. */
-
-  DO_IDCT_IFAST(row);
-
-  out0 = vec_sra(out0, PASS1_BITS3);
-  out1 = vec_sra(out1, PASS1_BITS3);
-  out2 = vec_sra(out2, PASS1_BITS3);
-  out3 = vec_sra(out3, PASS1_BITS3);
-  out4 = vec_sra(out4, PASS1_BITS3);
-  out5 = vec_sra(out5, PASS1_BITS3);
-  out6 = vec_sra(out6, PASS1_BITS3);
-  out7 = vec_sra(out7, PASS1_BITS3);
-
-  TRANSPOSE(out, col);
-
-  outb = vec_packs(col0, col1);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[0] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[1] + output_col);
-  *outptr = outbptr[1];
-
-  outb = vec_packs(col2, col3);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[2] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[3] + output_col);
-  *outptr = outbptr[1];
-
-  outb = vec_packs(col4, col5);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[4] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[5] + output_col);
-  *outptr = outbptr[1];
-
-  outb = vec_packs(col6, col7);
-  outb = vec_add(outb, PB_CENTERJSAMP);
-  outptr = (long long *)(output_buf[6] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[7] + output_col);
-  *outptr = outbptr[1];
-}

From d44ffd9db0029996f86e90594546621abc9999e4 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 01:10:11 +0000
Subject: [PATCH 071/108] Cosmetic tweaks to the PowerPC SIMD stubs

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1464 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jsimd_powerpc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index f92851c5..e3807bb9 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -1,5 +1,5 @@
 /*
- * jsimd_powerpc64.c
+ * jsimd_powerpc.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2009-2011, 2014 D. R. Commander
@@ -338,7 +338,7 @@ jsimd_can_idct_islow (void)
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
-  if (sizeof(DCTELEM) != 2)
+  if (sizeof(JCOEF) != 2)
     return 0;
 
   if (simd_support & JSIMD_ALTIVEC)
@@ -355,7 +355,7 @@ jsimd_can_idct_ifast (void)
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
-  if (sizeof(DCTELEM) != 2)
+  if (sizeof(JCOEF) != 2)
     return 0;
 
   if (simd_support & JSIMD_ALTIVEC)

From 752ee33e86ed1033d7262f8e74e10d29a8bc2e2c Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 01:32:51 +0000
Subject: [PATCH 072/108] Add a "quicktest" pseudo-target, for those times when
 you just don't want to sit through 11 iterations of TJUnitTest.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1465 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile.am b/Makefile.am
index b32a41b2..772d48ae 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -262,7 +262,7 @@ MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
 
 endif
 
-test: testclean all
+test: quicktest
 
 if WITH_TURBOJPEG
 if WITH_JAVA
@@ -280,6 +280,8 @@ endif
 	./tjunittest -yuv -noyuvpad
 endif
 
+quicktest: testclean all
+
 # These tests are carefully crafted to provide full coverage of as many of the
 # underlying algorithms as possible (including all of the SIMD-accelerated
 # ones.)

From ace9d06b9ebc9b0ef4dfa9b38634d1cc81686ab6 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 01:38:01 +0000
Subject: [PATCH 073/108] Make comments more consistent

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1466 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jfdctfst-altivec.c | 4 ++--
 simd/jfdctint-altivec.c | 4 ++--
 simd/jidctfst-altivec.c | 4 ++--
 simd/jidctint-altivec.c | 4 +++-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c
index 0c3687f8..3556ab56 100644
--- a/simd/jfdctfst-altivec.c
+++ b/simd/jfdctfst-altivec.c
@@ -106,7 +106,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   __vector unsigned short
     pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
 
-  /* Pass 1: process rows. */
+  /* Pass 1: process rows */
 
   row0 = *(__vector short *)&data[0];
   row1 = *(__vector short *)&data[8];
@@ -130,7 +130,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
   DO_FDCT();
 
-  /* Pass 2: process columns. */
+  /* Pass 2: process columns */
 
   TRANSPOSE(out, row);
 
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
index d1230c7c..7699b299 100644
--- a/simd/jfdctint-altivec.c
+++ b/simd/jfdctint-altivec.c
@@ -175,7 +175,7 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
   __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
     descale_p2 = { __4X(DESCALE_P2) };
 
-  /* Pass 1: process rows. */
+  /* Pass 1: process rows */
 
   row0 = *(__vector short *)&data[0];
   row1 = *(__vector short *)&data[8];
@@ -199,7 +199,7 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
 
   DO_FDCT_ROWS();
 
-  /* Pass 2: process columns. */
+  /* Pass 2: process columns */
 
   TRANSPOSE(out, row);
 
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
index f7749b84..33acb83c 100644
--- a/simd/jidctfst-altivec.c
+++ b/simd/jidctfst-altivec.c
@@ -125,7 +125,7 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
     pass1_bits3 = { __8X(PASS1_BITS + 3) };
   __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
 
-  /* Pass 1: process columns. */
+  /* Pass 1: process columns */
 
   col0 = *(__vector short *)&coef_block[0];
   col1 = *(__vector short *)&coef_block[8];
@@ -181,7 +181,7 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
     TRANSPOSE(out, row);
   }
 
-  /* Pass 2: process rows. */
+  /* Pass 2: process rows */
 
   DO_IDCT(row);
 
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
index 00c6c58d..d54479b7 100644
--- a/simd/jidctint-altivec.c
+++ b/simd/jidctint-altivec.c
@@ -207,7 +207,7 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
     const_bits = { __4X(CONST_BITS) };
   __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
 
-  /* Pass 1: process columns. */
+  /* Pass 1: process columns */
 
   col0 = *(__vector short *)&coef_block[0];
   col1 = *(__vector short *)&coef_block[8];
@@ -265,6 +265,8 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
     TRANSPOSE(out, row);
   }
 
+  /* Pass 2: process rows */
+
   DO_IDCT(row, 2);
 
   TRANSPOSE(out, col);

From a07787f4236fb20d45d9e71deb3d5683958b54e0 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 13:15:08 +0000
Subject: [PATCH 074/108] Maintain the traditional order of the regression
 tests while allowing the TurboJPEG and libjpeg portions to be executed
 separately

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1467 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 772d48ae..c312c91a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -262,7 +262,9 @@ MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
 
 endif
 
-test: quicktest
+test: tjquicktest bittest
+
+tjquicktest: testclean all
 
 if WITH_TURBOJPEG
 if WITH_JAVA
@@ -280,7 +282,7 @@ endif
 	./tjunittest -yuv -noyuvpad
 endif
 
-quicktest: testclean all
+bittest: testclean all
 
 # These tests are carefully crafted to provide full coverage of as many of the
 # underlying algorithms as possible (including all of the SIMD-accelerated

From 2ee9faef10689cbf691b869872e9769ba3742bc8 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 13:41:18 +0000
Subject: [PATCH 075/108] Make test a phony target so things don't go haywire
 if there is a file named test.c in the current directory.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1468 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.am b/Makefile.am
index c312c91a..c55b2c73 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -262,6 +262,7 @@ MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
 
 endif
 
+.PHONY: test
 test: tjquicktest bittest
 
 tjquicktest: testclean all

From 6a244cb5145ea8b2bdf4bf50802956218e542ed4 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 13:42:26 +0000
Subject: [PATCH 076/108] AltiVec SIMD implementation of RGB-to-YCC color
 conversion

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1469 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/Makefile.am        |   1 +
 simd/jccolext-altivec.c | 250 ++++++++++++++++++++++++++++++++++++++++
 simd/jccolor-altivec.c  | 143 +++++++++++++++++++++++
 simd/jsimd.h            |  22 ++++
 simd/jsimd_powerpc.c    |  44 +++++++
 5 files changed, 460 insertions(+)
 create mode 100644 simd/jccolext-altivec.c
 create mode 100644 simd/jccolor-altivec.c

diff --git a/simd/Makefile.am b/simd/Makefile.am
index d7b8d02a..6ce52822 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -73,6 +73,7 @@ endif
 if SIMD_POWERPC
 
 libsimd_la_SOURCES = jsimd_powerpc.c \
+	jccolor-altivec.c \
 	jfdctfst-altivec.c    jfdctint-altivec.c \
 	jidctfst-altivec.c    jidctint-altivec.c
 libsimd_la_CFLAGS = -maltivec
diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
new file mode 100644
index 00000000..989fc3c8
--- /dev/null
+++ b/simd/jccolext-altivec.c
@@ -0,0 +1,250 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+
+void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr;
+  JSAMPROW outptr0, outptr1, outptr2;
+  int pitch;
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
+    rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short y01, y23, cr01, cr23, cb01, cb23;
+  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) },
+    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+  __vector unsigned char zero = { __16X(0) },
+    shift_pack_index =
+      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
+         pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if RGB_PIXELSIZE == 3
+      /* Load 16 pixels == 48 bytes */
+      if ((size_t)inptr & 15) {
+        __vector unsigned char unaligned_shift_index;
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        else
+          rgb1 = vec_ld(-1, inptr + pitch);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+        else
+          rgb2 = vec_ld(-1, inptr + pitch);
+        if (pitch > 48)
+          rgb3 = vec_ld(48, inptr);
+        else
+          rgb3 = vec_ld(-1, inptr + pitch);
+        unaligned_shift_index = vec_lvsl(0, inptr);
+        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+      } else {
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+      }
+
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* Load 16 pixels == 64 bytes */
+      if ((size_t)inptr & 15) {
+        __vector unsigned char unaligned_shift_index;
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        else
+          rgb1 = vec_ld(-1, inptr + pitch);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+        else
+          rgb2 = vec_ld(-1, inptr + pitch);
+        if (pitch > 48)
+          rgb3 = vec_ld(48, inptr);
+        else
+          rgb3 = vec_ld(-1, inptr + pitch);
+        if (pitch > 64)
+          rgb4 = vec_ld(64, inptr);
+        else
+          rgb4 = vec_ld(-1, inptr + pitch);
+        unaligned_shift_index = vec_lvsl(0, inptr);
+        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+      } else {
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+        if (pitch > 48)
+          rgb3 = vec_ld(48, inptr);
+      }
+
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
+      bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
+      rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
+      bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
+      rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
+      bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
+      rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
+      bg3 = (__vector signed short)vec_mergel(zero, rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                     shift_pack_index);
+      y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                     shift_pack_index);
+      y = vec_pack(y01, y23);
+      vec_st(y, 0, outptr0);
+
+      /* Calculate Cb values */
+      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+                                   (__vector unsigned int)cb0);
+      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+                                   (__vector unsigned int)cb1);
+      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+                                   (__vector unsigned int)cb2);
+      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+                                   (__vector unsigned int)cb3);
+      cb01 = vec_perm((__vector unsigned short)cb0,
+                      (__vector unsigned short)cb1, shift_pack_index);
+      cb23 = vec_perm((__vector unsigned short)cb2,
+                      (__vector unsigned short)cb3, shift_pack_index);
+      cb = vec_pack(cb01, cb23);
+      vec_st(cb, 0, outptr1);
+
+      /* Calculate Cr values */
+      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+                                   (__vector unsigned int)cr0);
+      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+                                   (__vector unsigned int)cr1);
+      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+                                   (__vector unsigned int)cr2);
+      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+                                   (__vector unsigned int)cr3);
+      cr01 = vec_perm((__vector unsigned short)cr0,
+                      (__vector unsigned short)cr1, shift_pack_index);
+      cr23 = vec_perm((__vector unsigned short)cr2,
+                      (__vector unsigned short)cr3, shift_pack_index);
+      cr = vec_pack(cr01, cr23);
+      vec_st(cr, 0, outptr2);
+    }
+  }
+}
diff --git a/simd/jccolor-altivec.c b/simd/jccolor-altivec.c
new file mode 100644
index 00000000..ddb9fa06
--- /dev/null
+++ b/simd/jccolor-altivec.c
@@ -0,0 +1,143 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081 5329                 /* FIX(0.08131) */
+#define F_0_114 7471                 /* FIX(0.11400) */
+#define F_0_168 11059                /* FIX(0.16874) */
+#define F_0_250 16384                /* FIX(0.25000) */
+#define F_0_299 19595                /* FIX(0.29900) */
+#define F_0_331 21709                /* FIX(0.33126) */
+#define F_0_418 27439                /* FIX(0.41869) */
+#define F_0_500 32768                /* FIX(0.50000) */
+#define F_0_587 38470                /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 9b50610a..9f41ef64 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -116,6 +116,28 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 /* RGB & extended RGB --> Grayscale Colorspace Conversion */
 EXTERN(void) jsimd_rgb_gray_convert_mmx
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index e3807bb9..6e6be265 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -42,6 +42,19 @@ init_simd (void)
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -68,6 +81,37 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                        JDIMENSION output_row, int num_rows)
 {
+  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_extrgb_ycc_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_extrgbx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_extbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_extbgrx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_extxbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_extxrgb_ycc_convert_altivec;
+      break;
+    default:
+      altivecfct=jsimd_rgb_ycc_convert_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)

From 2a4e7f1bc347994a8924d551b1d65d2ded6afaff Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 13:57:30 +0000
Subject: [PATCH 077/108] Remove unneeded code;  Make sure jccolor-altivec.o
 will be rebuilt if jccolext-altivec.c changes.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1470 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/Makefile.am       |  2 ++
 simd/jccolor-altivec.c | 39 ---------------------------------------
 2 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/simd/Makefile.am b/simd/Makefile.am
index 6ce52822..8701194a 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -78,6 +78,8 @@ libsimd_la_SOURCES = jsimd_powerpc.c \
 	jidctfst-altivec.c    jidctint-altivec.c
 libsimd_la_CFLAGS = -maltivec
 
+jccolor-altivec.lo:  jccolext-altivec.c
+
 endif
 
 AM_CPPFLAGS = -I$(top_srcdir)
diff --git a/simd/jccolor-altivec.c b/simd/jccolor-altivec.c
index ddb9fa06..04b8708d 100644
--- a/simd/jccolor-altivec.c
+++ b/simd/jccolor-altivec.c
@@ -45,20 +45,11 @@
 #define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
 #define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
 #define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 #undef RGBG_INDEX0
 #undef RGBG_INDEX1
@@ -66,23 +57,14 @@
 #undef RGBG_INDEX3
 #undef jsimd_rgb_ycc_convert_altivec
 
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 #define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
 #define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 #undef RGBG_INDEX
 #undef jsimd_rgb_ycc_convert_altivec
 
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 #define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
 #define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
@@ -90,9 +72,6 @@
 #define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
 #define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 #undef RGBG_INDEX0
 #undef RGBG_INDEX1
@@ -100,44 +79,26 @@
 #undef RGBG_INDEX3
 #undef jsimd_rgb_ycc_convert_altivec
 
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 #define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
 #define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 #undef RGBG_INDEX
 #undef jsimd_rgb_ycc_convert_altivec
 
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 #define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
 #define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 #undef RGBG_INDEX
 #undef jsimd_rgb_ycc_convert_altivec
 
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 #define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
 #define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
 #include "jccolext-altivec.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
 #undef RGB_PIXELSIZE
 #undef RGBG_INDEX
 #undef jsimd_rgb_ycc_convert_altivec

From 243aba148ef1ffad006c1a4ef920ebde247fb6a8 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 14:10:33 +0000
Subject: [PATCH 078/108] AltiVec SIMD implementation of RGB-to-Grayscale color
 conversion

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1471 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/Makefile.am        |   3 +-
 simd/jcgray-altivec.c   |  99 ++++++++++++++++++++
 simd/jcgryext-altivec.c | 200 ++++++++++++++++++++++++++++++++++++++++
 simd/jsimd.h            |  22 +++++
 simd/jsimd_powerpc.c    |  44 +++++++++
 5 files changed, 367 insertions(+), 1 deletion(-)
 create mode 100644 simd/jcgray-altivec.c
 create mode 100644 simd/jcgryext-altivec.c

diff --git a/simd/Makefile.am b/simd/Makefile.am
index 8701194a..7aa36ae1 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -73,12 +73,13 @@ endif
 if SIMD_POWERPC
 
 libsimd_la_SOURCES = jsimd_powerpc.c \
-	jccolor-altivec.c \
+	jccolor-altivec.c     jcgray-altivec.c \
 	jfdctfst-altivec.c    jfdctint-altivec.c \
 	jidctfst-altivec.c    jidctint-altivec.c
 libsimd_la_CFLAGS = -maltivec
 
 jccolor-altivec.lo:  jccolext-altivec.c
+jcgray-altivec.lo:   jcgryext-altivec.c
 
 endif
 
diff --git a/simd/jcgray-altivec.c b/simd/jcgray-altivec.c
new file mode 100644
index 00000000..b52fade0
--- /dev/null
+++ b/simd/jcgray-altivec.c
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114 7471                 /* FIX(0.11400) */
+#define F_0_250 16384                /* FIX(0.25000) */
+#define F_0_299 19595                /* FIX(0.29900) */
+#define F_0_587 38470                /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
new file mode 100644
index 00000000..c1d0a453
--- /dev/null
+++ b/simd/jcgryext-altivec.c
@@ -0,0 +1,200 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c */
+
+
+void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
+                                     JSAMPARRAY input_buf,
+                                     JSAMPIMAGE output_buf,
+                                     JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch;
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
+    rgbg1, rgbg2, rgbg3, y;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short y01, y23;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char zero = { __16X(0) },
+    shift_pack_index =
+      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
+         pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {
+
+#if RGB_PIXELSIZE == 3
+      /* Load 16 pixels == 48 bytes */
+      if ((size_t)inptr & 15) {
+        __vector unsigned char unaligned_shift_index;
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        else
+          rgb1 = vec_ld(-1, inptr + pitch);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+        else
+          rgb2 = vec_ld(-1, inptr + pitch);
+        if (pitch > 48)
+          rgb3 = vec_ld(48, inptr);
+        else
+          rgb3 = vec_ld(-1, inptr + pitch);
+        unaligned_shift_index = vec_lvsl(0, inptr);
+        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+      } else {
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+      }
+
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* Load 16 pixels == 64 bytes */
+      if ((size_t)inptr & 15) {
+        __vector unsigned char unaligned_shift_index;
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        else
+          rgb1 = vec_ld(-1, inptr + pitch);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+        else
+          rgb2 = vec_ld(-1, inptr + pitch);
+        if (pitch > 48)
+          rgb3 = vec_ld(48, inptr);
+        else
+          rgb3 = vec_ld(-1, inptr + pitch);
+        if (pitch > 64)
+          rgb4 = vec_ld(64, inptr);
+        else
+          rgb4 = vec_ld(-1, inptr + pitch);
+        unaligned_shift_index = vec_lvsl(0, inptr);
+        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+      } else {
+        rgb0 = vec_ld(0, inptr);
+        if (pitch > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (pitch > 32)
+          rgb2 = vec_ld(32, inptr);
+        if (pitch > 48)
+          rgb3 = vec_ld(48, inptr);
+      }
+
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
+      bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
+      rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
+      bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
+      rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
+      bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
+      rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
+      bg3 = (__vector signed short)vec_mergel(zero, rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                     shift_pack_index);
+      y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                     shift_pack_index);
+      y = vec_pack(y01, y23);
+      vec_st(y, 0, outptr);
+    }
+  }
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 9f41ef64..a8009b57 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -206,6 +206,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 /* YCC --> RGB & extended RGB Colorspace Conversion */
 EXTERN(void) jsimd_ycc_rgb_convert_mmx
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index 6e6be265..a33bf626 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -61,6 +61,19 @@ jsimd_can_rgb_ycc (void)
 GLOBAL(int)
 jsimd_can_rgb_gray (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -119,6 +132,37 @@ jsimd_rgb_gray_convert (j_compress_ptr cinfo,
                         JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                         JDIMENSION output_row, int num_rows)
 {
+  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_extrgb_gray_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_extrgbx_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_extbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_extbgrx_gray_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_extxbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_extxrgb_gray_convert_altivec;
+      break;
+    default:
+      altivecfct=jsimd_rgb_gray_convert_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)

From 510e67c542d4aea8bcb4042e82c1fc3c0ab78e9f Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Mon, 22 Dec 2014 16:04:17 +0000
Subject: [PATCH 079/108] Use intrinsics for loading/storing data in the
 DCT/IDCT functions.  This has no effect on the performance of the aligned
 loads/stores, but it makes it more obvious what that code is doing.  Using
 intrinsics for the unaligned stores in the inverse DCT functions increases
 overall decompression performance by 1-2%.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1472 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jfdctfst-altivec.c | 32 ++++++++---------
 simd/jfdctint-altivec.c | 32 ++++++++---------
 simd/jidctfst-altivec.c | 78 ++++++++++++++++++++++++++---------------
 simd/jidctint-altivec.c | 62 +++++++++++++++++++++-----------
 4 files changed, 122 insertions(+), 82 deletions(-)

diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c
index 3556ab56..16a52df6 100644
--- a/simd/jfdctfst-altivec.c
+++ b/simd/jfdctfst-altivec.c
@@ -108,14 +108,14 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
   /* Pass 1: process rows */
 
-  row0 = *(__vector short *)&data[0];
-  row1 = *(__vector short *)&data[8];
-  row2 = *(__vector short *)&data[16];
-  row3 = *(__vector short *)&data[24];
-  row4 = *(__vector short *)&data[32];
-  row5 = *(__vector short *)&data[40];
-  row6 = *(__vector short *)&data[48];
-  row7 = *(__vector short *)&data[56];
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
 
   TRANSPOSE(row, col);
 
@@ -145,12 +145,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
 
   DO_FDCT();
 
-  *(__vector short *)&data[0] = out0;
-  *(__vector short *)&data[8] = out1;
-  *(__vector short *)&data[16] = out2;
-  *(__vector short *)&data[24] = out3;
-  *(__vector short *)&data[32] = out4;
-  *(__vector short *)&data[40] = out5;
-  *(__vector short *)&data[48] = out6;
-  *(__vector short *)&data[56] = out7;
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
 }
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
index 7699b299..548ab964 100644
--- a/simd/jfdctint-altivec.c
+++ b/simd/jfdctint-altivec.c
@@ -177,14 +177,14 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
 
   /* Pass 1: process rows */
 
-  row0 = *(__vector short *)&data[0];
-  row1 = *(__vector short *)&data[8];
-  row2 = *(__vector short *)&data[16];
-  row3 = *(__vector short *)&data[24];
-  row4 = *(__vector short *)&data[32];
-  row5 = *(__vector short *)&data[40];
-  row6 = *(__vector short *)&data[48];
-  row7 = *(__vector short *)&data[56];
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
 
   TRANSPOSE(row, col);
 
@@ -214,12 +214,12 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
 
   DO_FDCT_COLS();
 
-  *(__vector short *)&data[0] = out0;
-  *(__vector short *)&data[8] = out1;
-  *(__vector short *)&data[16] = out2;
-  *(__vector short *)&data[24] = out3;
-  *(__vector short *)&data[32] = out4;
-  *(__vector short *)&data[40] = out5;
-  *(__vector short *)&data[48] = out6;
-  *(__vector short *)&data[56] = out7;
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
 }
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
index 33acb83c..37a2f4e9 100644
--- a/simd/jidctfst-altivec.c
+++ b/simd/jidctfst-altivec.c
@@ -112,7 +112,7 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
     z5, z10, z10s, z11, z12s, z13,
     out0, out1, out2, out3, out4, out5, out6, out7;
   __vector signed char outb;
-  long long *outptr, *outbptr = (long long *)(&outb);
+  int *outptr;
 
   /* Constants */
   __vector short zero = { __8X(0) },
@@ -127,14 +127,14 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
 
   /* Pass 1: process columns */
 
-  col0 = *(__vector short *)&coef_block[0];
-  col1 = *(__vector short *)&coef_block[8];
-  col2 = *(__vector short *)&coef_block[16];
-  col3 = *(__vector short *)&coef_block[24];
-  col4 = *(__vector short *)&coef_block[32];
-  col5 = *(__vector short *)&coef_block[40];
-  col6 = *(__vector short *)&coef_block[48];
-  col7 = *(__vector short *)&coef_block[56];
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
 
   tmp1 = vec_or(col1, col2);
   tmp2 = vec_or(col3, col4);
@@ -196,31 +196,51 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
 
   TRANSPOSE(out, col);
 
-  outb = vec_packs(col0, col1);
+  outb = vec_packs(col0, col0);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[0] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[1] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 
-  outb = vec_packs(col2, col3);
+  outb = vec_packs(col1, col1);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[2] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[3] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 
-  outb = vec_packs(col4, col5);
+  outb = vec_packs(col2, col2);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[4] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[5] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 
-  outb = vec_packs(col6, col7);
+  outb = vec_packs(col3, col3);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[6] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[7] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 }
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
index d54479b7..a354fcc6 100644
--- a/simd/jidctint-altivec.c
+++ b/simd/jidctint-altivec.c
@@ -186,7 +186,7 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
     out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
     out5l, out5h, out6l, out6h, out7l, out7h;
   __vector signed char outb;
-  long long *outptr, *outbptr = (long long *)(&outb);
+  int *outptr;
 
   /* Constants */
   __vector short zero16 = { __8X(0) },
@@ -271,31 +271,51 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
 
   TRANSPOSE(out, col);
 
-  outb = vec_packs(col0, col1);
+  outb = vec_packs(col0, col0);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[0] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[1] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 
-  outb = vec_packs(col2, col3);
+  outb = vec_packs(col1, col1);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[2] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[3] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 
-  outb = vec_packs(col4, col5);
+  outb = vec_packs(col2, col2);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[4] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[5] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 
-  outb = vec_packs(col6, col7);
+  outb = vec_packs(col3, col3);
   outb = vec_add(outb, pb_centerjsamp);
-  outptr = (long long *)(output_buf[6] + output_col);
-  *outptr = outbptr[0];
-  outptr = (long long *)(output_buf[7] + output_col);
-  *outptr = outbptr[1];
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
 }

From 70d831dc0dc38a9f3ffbaf83f370bd292b0a7608 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 23 Dec 2014 02:42:59 +0000
Subject: [PATCH 080/108] Document the fact that the AltiVec implementation
 uses the same modified algorithms as the SSE2 implementation

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1473 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jfdctint-altivec.c | 37 +++++++++++++++++++++++++++++++++++++
 simd/jidctfst-altivec.c | 10 ++++++++++
 simd/jidctint-altivec.c | 39 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
index 548ab964..1ddf2619 100644
--- a/simd/jfdctint-altivec.c
+++ b/simd/jfdctint-altivec.c
@@ -46,6 +46,16 @@
 
 #define DO_FDCT_COMMON(PASS)  \
 {  \
+  /* (Original)  \
+   * z1 = (tmp12 + tmp13) * 0.541196100;  \
+   * data2 = z1 + tmp13 * 0.765366865;  \
+   * data6 = z1 + tmp12 * -1.847759065;  \
+   *  \
+   * (This implementation)  \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
+   */  \
+  \
   tmp1312l = vec_mergeh(tmp13, tmp12);  \
   tmp1312h = vec_mergel(tmp13, tmp12);  \
   \
@@ -67,6 +77,16 @@
   z3 = vec_add(tmp4, tmp6);  \
   z4 = vec_add(tmp5, tmp7);  \
   \
+  /* (Original)  \
+   * z5 = (z3 + z4) * 1.175875602;  \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
+   * z3 += z5;  z4 += z5;  \
+   *  \
+   * (This implementation)  \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
+   */  \
+  \
   z34l = vec_mergeh(z3, z4);  \
   z34h = vec_mergel(z3, z4);  \
   \
@@ -75,6 +95,23 @@
   z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
   z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
   \
+  /* (Original)  \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
+   *  \
+   * (This implementation)  \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
+   */  \
+  \
   tmp47l = vec_mergeh(tmp4, tmp7);  \
   tmp47h = vec_mergel(tmp4, tmp7);  \
   \
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
index 37a2f4e9..282a97e0 100644
--- a/simd/jidctfst-altivec.c
+++ b/simd/jidctfst-altivec.c
@@ -77,6 +77,16 @@
   \
   tmp7 = vec_add(z11, z13);  \
   \
+  /* To avoid overflow...  \
+   *  \
+   * (Original)  \
+   * tmp12 = -2.613125930 * z10 + z5;  \
+   *  \
+   * (This implementation)  \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
+   *       = -1.613125930 * z10 - z10 + z5;  \
+   */  \
+  \
   z5 = vec_add(z10s, z12s);  \
   z5 = vec_madds(z5, pw_F1847, zero);  \
   \
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
index a354fcc6..c30c885d 100644
--- a/simd/jidctint-altivec.c
+++ b/simd/jidctint-altivec.c
@@ -46,7 +46,17 @@
 
 #define DO_IDCT(in, PASS)  \
 {  \
-  /* Even part */  \
+  /* Even part  \
+   *  \
+   * (Original)  \
+   * z1 = (z2 + z3) * 0.541196100;  \
+   * tmp2 = z1 + z3 * -1.847759065;  \
+   * tmp3 = z1 + z2 * 0.765366865;  \
+   *  \
+   * (This implementation)  \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
+   */  \
   \
   in##26l = vec_mergeh(in##2, in##6);  \
   in##26h = vec_mergel(in##2, in##6);  \
@@ -88,6 +98,16 @@
   z3 = vec_add(in##3, in##7);  \
   z4 = vec_add(in##1, in##5);  \
   \
+  /* (Original)  \
+   * z5 = (z3 + z4) * 1.175875602;  \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
+   * z3 += z5;  z4 += z5;  \
+   *  \
+   * (This implementation)  \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
+   */  \
+  \
   z34l = vec_mergeh(z3, z4);  \
   z34h = vec_mergel(z3, z4);  \
   \
@@ -96,6 +116,23 @@
   z4l = vec_msums(z34l, pw_f117_f078, zero32);  \
   z4h = vec_msums(z34h, pw_f117_f078, zero32);  \
   \
+  /* (Original)  \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
+   *  \
+   * (This implementation)  \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
+   * tmp0 += z3;  tmp1 += z4;  \
+   * tmp2 += z3;  tmp3 += z4;  \
+   */  \
+  \
   in##71l = vec_mergeh(in##7, in##1);  \
   in##71h = vec_mergel(in##7, in##1);  \
   \

From f654cf0e2c266d0ba8a3b85c72c14d3bf027d91f Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 23 Dec 2014 04:14:54 +0000
Subject: [PATCH 081/108] AltiVec SIMD implementation of sample conversion and
 integer quantization

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1474 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/Makefile.am       |   3 +-
 simd/jquanti-altivec.c | 236 +++++++++++++++++++++++++++++++++++++++++
 simd/jsimd.h           |   6 ++
 simd/jsimd_powerpc.c   |  30 ++++++
 4 files changed, 274 insertions(+), 1 deletion(-)
 create mode 100644 simd/jquanti-altivec.c

diff --git a/simd/Makefile.am b/simd/Makefile.am
index 7aa36ae1..dd0148db 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -75,7 +75,8 @@ if SIMD_POWERPC
 libsimd_la_SOURCES = jsimd_powerpc.c \
 	jccolor-altivec.c     jcgray-altivec.c \
 	jfdctfst-altivec.c    jfdctint-altivec.c \
-	jidctfst-altivec.c    jidctint-altivec.c
+	jidctfst-altivec.c    jidctint-altivec.c \
+  jquanti-altivec.c
 libsimd_la_CFLAGS = -maltivec
 
 jccolor-altivec.lo:  jccolext-altivec.c
diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c
new file mode 100644
index 00000000..12c97eef
--- /dev/null
+++ b/simd/jquanti-altivec.c
@@ -0,0 +1,236 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
+ * always get the data we want by using a single vector load (although we may
+ * have to permute the result.)
+ */
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_ld(0, elemptr);  \
+  if ((size_t)elemptr & 15)  \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
+}
+
+
+void
+jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
+                        DCTELEM * workspace)
+{
+  JSAMPROW elemptr;
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char zero = { __16X(0) };
+
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
+  out0 = (__vector short)vec_mergeh(zero, in0);
+  out1 = (__vector short)vec_mergeh(zero, in1);
+  out2 = (__vector short)vec_mergeh(zero, in2);
+  out3 = (__vector short)vec_mergeh(zero, in3);
+  out4 = (__vector short)vec_mergeh(zero, in4);
+  out5 = (__vector short)vec_mergeh(zero, in5);
+  out6 = (__vector short)vec_mergeh(zero, in6);
+  out7 = (__vector short)vec_mergeh(zero, in7);
+
+  out0 = vec_sub(out0, pw_centerjsamp);
+  out1 = vec_sub(out1, pw_centerjsamp);
+  out2 = vec_sub(out2, pw_centerjsamp);
+  out3 = vec_sub(out3, pw_centerjsamp);
+  out4 = vec_sub(out4, pw_centerjsamp);
+  out5 = vec_sub(out5, pw_centerjsamp);
+  out6 = vec_sub(out6, pw_centerjsamp);
+  out7 = vec_sub(out7, pw_centerjsamp);
+
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT 16
+
+/* AltiVec lacks an unsigned 16-bit multiply-high; emulate with mule/mulo. */
+
+#define MULTIPLY(vs0, vs1, out) {  \
+  tmpe = vec_mule((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  tmpo = vec_mulo((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
+                                 (__vector unsigned short)tmpo,  \
+                                 shift_pack_index);  \
+}
+
+void
+jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
+                        DCTELEM * workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7;
+  __vector short row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s;
+  __vector short corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7;
+  __vector short recip0, recip1, recip2, recip3, recip4, recip5, recip6,
+    recip7;
+  __vector short scale0, scale1, scale2, scale3, scale4, scale5, scale6,
+    scale7;
+  __vector unsigned int tmpe, tmpo;
+
+  /* Constants */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+  __vector unsigned char shift_pack_index =
+      { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value */
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index a8009b57..3cb63eca 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -568,6 +568,9 @@ EXTERN(void) jsimd_convsamp_neon
 EXTERN(void) jsimd_convsamp_mips_dspr2
         (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
 
+EXTERN(void) jsimd_convsamp_altivec
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+
 /* Floating Point Sample Conversion */
 EXTERN(void) jsimd_convsamp_float_3dnow
         (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
@@ -622,6 +625,9 @@ EXTERN(void) jsimd_quantize_neon
 EXTERN(void) jsimd_quantize_mips_dspr2
         (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
 
+EXTERN(void) jsimd_quantize_altivec
+        (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+
 /* Floating Point Quantization */
 EXTERN(void) jsimd_quantize_float_3dnow
         (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index a33bf626..60dad60b 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -290,6 +290,21 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
 GLOBAL(int)
 jsimd_can_convsamp (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -303,6 +318,7 @@ GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
                 DCTELEM * workspace)
 {
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
@@ -371,6 +387,19 @@ jsimd_fdct_float (FAST_FLOAT * data)
 GLOBAL(int)
 jsimd_can_quantize (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -384,6 +413,7 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
                 DCTELEM * workspace)
 {
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)

From 576eef0509f2a12452c03397bb9fdb2a11ec79c8 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Fri, 26 Dec 2014 17:49:25 -0500
Subject: [PATCH 082/108] Fix compilation issue

Issue arose when C_ARITH_CODING_SUPPORTED was undefined
---
 jccoefct.c | 11 +++++++++--
 jcdctmgr.c |  2 ++
 jpegint.h  |  4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/jccoefct.c b/jccoefct.c
index 7c3d58f9..8d9b50d7 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -367,18 +367,23 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     c_derived_tbl actbl_data;
     c_derived_tbl *actbl = &actbl_data;
     
+#ifdef C_ARITH_CODING_SUPPORTED
     arith_rates arith_r_data;
     arith_rates *arith_r = &arith_r_data;
+#endif
     
     compptr = cinfo->cur_comp_info[ci];
 
+#ifdef C_ARITH_CODING_SUPPORTED
     if (cinfo->arith_code)
       jget_arith_rates(cinfo, compptr->dc_tbl_no, compptr->ac_tbl_no, arith_r);
-    else {
+    else
+#endif
+    {
       jpeg_make_c_derived_tbl(cinfo, TRUE, compptr->dc_tbl_no, &dctbl);
       jpeg_make_c_derived_tbl(cinfo, FALSE, compptr->ac_tbl_no, &actbl);
     }
-    
+
     /* Align the virtual buffer for this component. */
     buffer = (*cinfo->mem->access_virt_barray)
     ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
@@ -413,6 +418,7 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
       lastblockrow = (block_row > 0) ? buffer[block_row-1] : NULL;
+#ifdef C_ARITH_CODING_SUPPORTED
       if (cinfo->arith_code)
         quantize_trellis_arith(cinfo, arith_r, thisblockrow,
                                buffer_dst[block_row], blocks_across,
@@ -421,6 +427,7 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
                                cinfo->master->norm_coef[compptr->quant_tbl_no],
                                &lastDC, lastblockrow, buffer_dst[block_row-1]);
       else
+#endif
         quantize_trellis(cinfo, dctbl, actbl, thisblockrow,
                          buffer_dst[block_row], blocks_across,
                          cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 3380b05c..d2465de7 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -1261,6 +1261,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
 
 }
 
+#ifdef C_ARITH_CODING_SUPPORTED
 GLOBAL(void)
 quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
                  JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
@@ -1604,6 +1605,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
     }
   }
 }
+#endif
 
 /*
  * Initialize FDCT manager.
diff --git a/jpegint.h b/jpegint.h
index 194bb9ee..995f188f 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -91,6 +91,7 @@ struct jpeg_comp_master {
   float trellis_delta_dc_weight;
 };
 
+#ifdef C_ARITH_CODING_SUPPORTED
 /* The following two definitions specify the allocation chunk size
  * for the statistics area.
  * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
@@ -114,6 +115,7 @@ typedef struct {
   int arith_dc_U;
   int arith_ac_K;
 } arith_rates;
+#endif
 
 /* Main buffer control (downsampled-data buffer) */
 struct jpeg_c_main_controller {
@@ -389,12 +391,14 @@ EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
                               JDIMENSION num_blocks);
 EXTERN(void) jzero_far (void * target, size_t bytestozero);
 
+#ifdef C_ARITH_CODING_SUPPORTED
 EXTERN(void) jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rates *r);
 
 EXTERN(void) quantize_trellis_arith
 (j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
  JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
  JBLOCKROW coef_blocks_above, JBLOCKROW src_above);
+#endif
 
 /* Constant tables in jutils.c */
 #if 0                           /* This table is not actually needed in v6a */

From cdb6c34e1c5e7342014ee71739697c353fa9b35f Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Mon, 29 Dec 2014 18:23:51 +0100
Subject: [PATCH 083/108] Fix overflow issue in DCT

Replace add/sub with their saturating equivalents to avoid overflow.
Should resolve #139
---
 simd/jfdctint-mmx.asm     | 8 ++++----
 simd/jfdctint-sse2-64.asm | 8 ++++----
 simd/jfdctint-sse2.asm    | 8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/simd/jfdctint-mmx.asm b/simd/jfdctint-mmx.asm
index 8536ae2c..e2d87438 100644
--- a/simd/jfdctint-mmx.asm
+++ b/simd/jfdctint-mmx.asm
@@ -444,11 +444,11 @@ EXTN(jsimd_fdct_islow_mmx):
         psubw   mm6,mm4                 ; mm6=tmp12
 
         movq    mm7,mm5
-        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
-        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
+        paddsw   mm5,mm0                 ; mm5=tmp10+tmp11
+        psubsw   mm7,mm0                 ; mm7=tmp10-tmp11
 
-        paddw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        paddw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        paddsw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        paddsw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
         psraw   mm5,PASS1_BITS          ; mm5=data0
         psraw   mm7,PASS1_BITS          ; mm7=data4
 
diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm
index 53ec5276..c5583725 100644
--- a/simd/jfdctint-sse2-64.asm
+++ b/simd/jfdctint-sse2-64.asm
@@ -454,11 +454,11 @@ EXTN(jsimd_fdct_islow_sse2):
         psubw   xmm6,xmm4               ; xmm6=tmp12
 
         movdqa  xmm5,xmm7
-        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
-        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
+        paddsw   xmm7,xmm2               ; xmm7=tmp10+tmp11
+        psubsw   xmm5,xmm2               ; xmm5=tmp10-tmp11
 
-        paddw   xmm7,[rel PW_DESCALE_P2X]
-        paddw   xmm5,[rel PW_DESCALE_P2X]
+        paddsw   xmm7,[rel PW_DESCALE_P2X]
+        paddsw   xmm5,[rel PW_DESCALE_P2X]
         psraw   xmm7,PASS1_BITS         ; xmm7=data0
         psraw   xmm5,PASS1_BITS         ; xmm5=data4
 
diff --git a/simd/jfdctint-sse2.asm b/simd/jfdctint-sse2.asm
index 85133f80..af33ec89 100644
--- a/simd/jfdctint-sse2.asm
+++ b/simd/jfdctint-sse2.asm
@@ -462,11 +462,11 @@ EXTN(jsimd_fdct_islow_sse2):
         psubw   xmm6,xmm4               ; xmm6=tmp12
 
         movdqa  xmm5,xmm7
-        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
-        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
+        paddsw   xmm7,xmm2               ; xmm7=tmp10+tmp11
+        psubsw   xmm5,xmm2               ; xmm5=tmp10-tmp11
 
-        paddw   xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        paddw   xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        paddsw   xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        paddsw   xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
         psraw   xmm7,PASS1_BITS         ; xmm7=data0
         psraw   xmm5,PASS1_BITS         ; xmm5=data4
 

From bf506e11b79c470ebb7840f1b4f9545a9cfb690c Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Mon, 29 Dec 2014 18:38:36 +0100
Subject: [PATCH 084/108] Check image size when reading targa file

Throw an error if the image width or height is 0 when reading a Targa file.
Solves #140
---
 rdtarga.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rdtarga.c b/rdtarga.c
index afd17c43..4e38258d 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -364,7 +364,8 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   if (cmaptype > 1 ||           /* cmaptype must be 0 or 1 */
       source->pixel_size < 1 || source->pixel_size > 4 ||
       (UCH(targaheader[16]) & 7) != 0 || /* bits/pixel must be multiple of 8 */
-      interlace_type != 0)      /* currently don't allow interlaced image */
+      interlace_type != 0 ||      /* currently don't allow interlaced image */
+      width == 0 || height == 0)  /* image width/height must be nonzero */
     ERREXIT(cinfo, JERR_TGA_BADPARMS);
 
   if (subtype > 8) {

From 5ba6c7effb70a8f1ef6cf4b524102d36d6976570 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Mon, 29 Dec 2014 19:42:20 +0100
Subject: [PATCH 085/108] Check range of integer values in PPM text file

Add checks in PPM text file reading to make sure values are within the
specified range.
Fixes #141
---
 cderror.h |  1 +
 rdppm.c   | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/cderror.h b/cderror.h
index 71a77d19..77b8eb64 100644
--- a/cderror.h
+++ b/cderror.h
@@ -74,6 +74,7 @@ JMESSAGE(JWRN_GIF_NOMOREDATA, "Ran out of GIF bits")
 #ifdef PPM_SUPPORTED
 JMESSAGE(JERR_PPM_COLORSPACE, "PPM output must be grayscale or RGB")
 JMESSAGE(JERR_PPM_NONNUMERIC, "Nonnumeric data in PPM file")
+JMESSAGE(JERR_PPM_TOOLARGE, "Integer value too large in PPM file")
 JMESSAGE(JERR_PPM_NOT, "Not a PPM/PGM file")
 JMESSAGE(JTRC_PGM, "%ux%u PGM image")
 JMESSAGE(JTRC_PGM_TEXT, "%ux%u text PGM image")
diff --git a/rdppm.c b/rdppm.c
index f56d5cf3..18119895 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -68,6 +68,7 @@ typedef struct {
   JSAMPROW pixrow;              /* compressor input buffer */
   size_t buffer_width;          /* width of I/O buffer */
   JSAMPLE *rescale;             /* => maxval-remapping array, or NULL */
+  int maxval;
 } ppm_source_struct;
 
 typedef ppm_source_struct * ppm_source_ptr;
@@ -91,7 +92,7 @@ pbm_getc (FILE * infile)
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_compress_ptr cinfo, FILE * infile)
+read_pbm_integer (j_compress_ptr cinfo, FILE * infile, int maxval)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -115,6 +116,10 @@ read_pbm_integer (j_compress_ptr cinfo, FILE * infile)
     val *= 10;
     val += ch - '0';
   }
+  
+  if (val > maxval)
+    ERREXIT(cinfo, JERR_PPM_TOOLARGE);
+  
   return val;
 }
 
@@ -139,10 +144,11 @@ get_text_gray_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
+  int maxval = source->maxval;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
   }
   return 1;
 }
@@ -157,12 +163,13 @@ get_text_rgb_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
+  int maxval = source->maxval;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
   }
   return 1;
 }
@@ -311,9 +318,9 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   }
 
   /* fetch the remaining header info */
-  w = read_pbm_integer(cinfo, source->pub.input_file);
-  h = read_pbm_integer(cinfo, source->pub.input_file);
-  maxval = read_pbm_integer(cinfo, source->pub.input_file);
+  w = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+  h = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+  maxval = read_pbm_integer(cinfo, source->pub.input_file, 65535);
 
   if (w <= 0 || h <= 0 || maxval <= 0) /* error check */
     ERREXIT(cinfo, JERR_PPM_NOT);
@@ -321,6 +328,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
   cinfo->image_width = (JDIMENSION) w;
   cinfo->image_height = (JDIMENSION) h;
+  source->maxval = maxval;
 
   /* initialize flags to most common settings */
   need_iobuffer = TRUE;         /* do we need an I/O buffer? */

From e4189accd3b8407b868880ce70b59b18f06b768d Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Mon, 29 Dec 2014 18:55:10 -0600
Subject: [PATCH 086/108] Add instructions for building from release tarball,
 no autoreconf required.

---
 BUILDING.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/BUILDING.txt b/BUILDING.txt
index fbcd3b94..cd3064c9 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -68,12 +68,25 @@ The following procedure will build mozjpeg on Linux, FreeBSD, Cygwin, and
 Solaris/x86 systems (on Solaris, this generates a 32-bit library.  See below
 for 64-bit build instructions.)
 
+Simple Release tar.gz Source Build
+----------------------------------
+
+  cd {source_directory}
+  ./configure [additional configure flags]
+  make
+
+Non-Release Source Build (e.g. GitHub clone)
+--------------------------------------------
+
   cd {source_directory}
   autoreconf -fiv
   cd {build_directory}
   sh {source_directory}/configure [additional configure flags]
   make
 
+Results
+-------
+
 This will generate the following files under .libs/
 
   libjpeg.a

From 7faa703ebf7360e5c0a37d71a74c293232998340 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Tue, 30 Dec 2014 08:31:39 +0100
Subject: [PATCH 087/108] Remove unused variables

Fixes #142
---
 jcdctmgr.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/jcdctmgr.c b/jcdctmgr.c
index d2465de7..17127b0f 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -1282,9 +1282,6 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
   jpeg_lambda_weights_csf_luma :
   jpeg_lambda_weights_flat;
   int Ss, Se;
-  int has_eob;
-  float cost_all_zeros;
-  float best_cost_skip;
   float cost;
   float run_bits;
   int rate;
@@ -1537,14 +1534,11 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
     
     last_coeff_idx = Ss-1;
     best_cost = accumulated_zero_dist[Se] + r->rate_ac[0][1];
-    cost_all_zeros = accumulated_zero_dist[Se];
-    best_cost_skip = cost_all_zeros;
     
     for (i = Ss; i <= Se; i++) {
       int z = jpeg_natural_order[i];
       if (coef_blocks[bi][z] != 0) {
         float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
-        float cost_wo_eob = cost;
         
         if (i < Se)
           cost += r->rate_ac[3*(i-1)][1];
@@ -1552,13 +1546,10 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
         if (cost < best_cost) {
           best_cost = cost;
           last_coeff_idx = i;
-          best_cost_skip = cost_wo_eob;
         }
       }
     }
     
-    has_eob = (last_coeff_idx < Se) + (last_coeff_idx == Ss-1);
-    
     /* Zero out coefficients that are part of runs */
     i = Se;
     while (i >= Ss)

From aa20343efd67df2785da0db46ed9fc416c33a455 Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Tue, 30 Dec 2014 13:58:28 -0600
Subject: [PATCH 088/108] Update README.md with 3.0 announcement

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 07f2036b..74f5eece 100644
--- a/README.md
+++ b/README.md
@@ -11,4 +11,5 @@ More information:
 
 * [Version 1.0 Announcement](https://blog.mozilla.org/research/2014/03/05/introducing-the-mozjpeg-project/)
 * [Version 2.0 Announcement](https://blog.mozilla.org/research/2014/07/15/mozilla-advances-jpeg-encoding-with-mozjpeg-2-0/)
+* [Version 3.0 Announcement](https://boomswaggerboom.wordpress.com/2014/12/30/mozjpeg-3-0-released/)
 * [Mailing List](https://lists.mozilla.org/listinfo/dev-mozjpeg)</a>

From 71441f322ecefdd2da1f9ddd5992e4c6bd6f0b53 Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Tue, 30 Dec 2014 13:59:44 -0600
Subject: [PATCH 089/108] Get rid of changelog file that we don't update.

---
 ChangeLog-mozilla.txt | 4 ----
 1 file changed, 4 deletions(-)
 delete mode 100644 ChangeLog-mozilla.txt

diff --git a/ChangeLog-mozilla.txt b/ChangeLog-mozilla.txt
deleted file mode 100644
index 3c2bf8db..00000000
--- a/ChangeLog-mozilla.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Version 1.0 (March 4, 2014)
-===========================
-
-* Add 'jpgcrush' functionality. Figures out which progressive coding configuration uses the fewest bits.

From 5e8fd2413171efab8d5f93a0ec3fc8dd5cd424f1 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 7 Jan 2015 05:47:08 +0000
Subject: [PATCH 090/108] Oops.  Delete the duplicate copy of
 [lib]turbojpeg.dll in the binary directory when uninstalling the package.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1482 632fc199-4ca6-4c93-a231-07263d6284db
---
 release/libjpeg-turbo.nsi.in | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/release/libjpeg-turbo.nsi.in b/release/libjpeg-turbo.nsi.in
index c3941fe4..c675eac2 100755
--- a/release/libjpeg-turbo.nsi.in
+++ b/release/libjpeg-turbo.nsi.in
@@ -110,6 +110,7 @@ Section "Uninstall"
 
 !ifdef GCC
 	Delete $INSTDIR\bin\libjpeg-@DLL_VERSION@.dll
+	Delete $INSTDIR\bin\libturbojpeg.dll
 	Delete $SYSDIR\libturbojpeg.dll
 	Delete $INSTDIR\lib\libturbojpeg.dll.a"
 	Delete $INSTDIR\lib\libturbojpeg.a"
@@ -117,6 +118,7 @@ Section "Uninstall"
 	Delete $INSTDIR\lib\libjpeg.a"
 !else
 	Delete $INSTDIR\bin\jpeg@DLL_VERSION@.dll
+	Delete $INSTDIR\bin\turbojpeg.dll
 	Delete $SYSDIR\turbojpeg.dll
 	Delete $INSTDIR\lib\jpeg.lib
 	Delete $INSTDIR\lib\jpeg-static.lib

From 85e2e0f9c27790fba501ed7b7fa01f4bd562c8b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kornel=20Lesin=CC=81ski?= <kornel@geekhood.net>
Date: Thu, 8 Jan 2015 00:53:31 +0000
Subject: [PATCH 091/108] Declare inbuffer const

---
 jdatasrc.c | 4 ++--
 jpeglib.h  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/jdatasrc.c b/jdatasrc.c
index bf70422b..22a09c87 100644
--- a/jdatasrc.c
+++ b/jdatasrc.c
@@ -254,7 +254,7 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
 
 GLOBAL(void)
 jpeg_mem_src (j_decompress_ptr cinfo,
-              unsigned char * inbuffer, unsigned long insize)
+              const unsigned char * inbuffer, unsigned long insize)
 {
   struct jpeg_source_mgr * src;
 
@@ -278,6 +278,6 @@ jpeg_mem_src (j_decompress_ptr cinfo,
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
   src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (JOCTET *) inbuffer;
+  src->next_input_byte = (const JOCTET *) inbuffer;
 }
 #endif
diff --git a/jpeglib.h b/jpeglib.h
index 059b0cf4..c1bff3bf 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -956,7 +956,7 @@ EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile);
 /* Data source and destination managers: memory buffers. */
 EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char ** outbuffer,
                             unsigned long * outsize);
-EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, unsigned char * inbuffer,
+EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, const unsigned char * inbuffer,
                            unsigned long insize);
 #endif
 

From 0aea1da9f0c8577e7fde4ff99c5e75dfdf075472 Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@gmail.com>
Date: Mon, 12 Jan 2015 12:18:58 -0800
Subject: [PATCH 092/108] Re-order links.

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 74f5eece..e39d7593 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ The idea is to reduce transfer times for JPEGs on the Web, thus reducing page lo
 
 More information:
 
-* [Version 1.0 Announcement](https://blog.mozilla.org/research/2014/03/05/introducing-the-mozjpeg-project/)
-* [Version 2.0 Announcement](https://blog.mozilla.org/research/2014/07/15/mozilla-advances-jpeg-encoding-with-mozjpeg-2-0/)
-* [Version 3.0 Announcement](https://boomswaggerboom.wordpress.com/2014/12/30/mozjpeg-3-0-released/)
 * [Mailing List](https://lists.mozilla.org/listinfo/dev-mozjpeg)</a>
+* [Version 3.0 Announcement](https://boomswaggerboom.wordpress.com/2014/12/30/mozjpeg-3-0-released/)
+* [Version 2.0 Announcement](https://blog.mozilla.org/research/2014/07/15/mozilla-advances-jpeg-encoding-with-mozjpeg-2-0/)
+* [Version 1.0 Announcement](https://blog.mozilla.org/research/2014/03/05/introducing-the-mozjpeg-project/)

From 0c8bf27a3cc90abc442d6a225aeb24c6a0dda026 Mon Sep 17 00:00:00 2001
From: Jodie Cunningham <jodie.cunningham@gmail.com>
Date: Wed, 21 Jan 2015 22:20:37 -0600
Subject: [PATCH 093/108] Add pkg-config requirement

Related to issue #99
---
 BUILDING.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/BUILDING.txt b/BUILDING.txt
index cd3064c9..7e753c68 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -7,6 +7,7 @@
 Build Requirements
 ==================
 
+-- pkg-config
 -- autoconf 2.56 or later
 -- automake 1.7 or later
 -- libtool 1.4 or later

From f446e5d5c71b372f09498d72c79b75e3b3445191 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kornel=20Lesin=CC=81ski?= <kornel@geekhood.net>
Date: Sat, 24 Jan 2015 12:56:39 +0000
Subject: [PATCH 094/108] Expanded .gitignore

---
 .gitignore | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 781c56c3..29478635 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,44 @@
 Makefile.in
+Makefile
+/CMakeFiles
 /autom4te.cache
 /aclocal.m4
 /compile
 /configure
 /depcomp
 /install-sh
+/libtool
 /missing
-/stamp-h1
+/stamp-h*
+/java/classnoinst.stamp
+/pkgscripts/
+/jconfig.h
+/jconfigint.h
 /config.guess
+/config.h
 /config.h.in
+/config.log
+/config.status
 /config.sub
 /ltmain.sh
 /ar-lib
+/libjpeg.map
+/.libs/
+/simd/.libs/
+/simd/jsimdcfg.inc
+*.o
+*.lo
+*.la
+/cjpeg
+/djpeg
+/jcstest
+/jpegtran
+/jpegyuv
+/md5/md5cmp
+/rdjpgcom
+/test_enc_*
+/tjbench
+/tjbenchtest
+/tjunittest
+/wrjpgcom
+/yuvjpeg

From 67753d12988934497a93a92e93c5c57449082126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kornel=20Lesin=CC=81ski?= <kornel@geekhood.net>
Date: Sat, 24 Jan 2015 16:27:46 +0000
Subject: [PATCH 095/108] Const on simple getters and copy source

---
 jcext.c    |  6 +++---
 jctrans.c  |  2 +-
 jdapimin.c |  4 ++--
 jpeglib.h  | 12 ++++++------
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/jcext.c b/jcext.c
index 9c95f198..1a7da407 100644
--- a/jcext.c
+++ b/jcext.c
@@ -16,7 +16,7 @@
 
 
 GLOBAL(boolean)
-jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
+jpeg_c_bool_param_supported (const j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
 {
   switch (param) {
   case JBOOLEAN_OPTIMIZE_SCANS:
@@ -98,7 +98,7 @@ jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
 
 
 GLOBAL(boolean)
-jpeg_c_float_param_supported (j_compress_ptr cinfo, J_FLOAT_PARAM param)
+jpeg_c_float_param_supported (const j_compress_ptr cinfo, J_FLOAT_PARAM param)
 {
   switch (param) {
   case JFLOAT_LAMBDA_LOG_SCALE1:
@@ -149,7 +149,7 @@ jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param)
 
 
 GLOBAL(boolean)
-jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param)
+jpeg_c_int_param_supported (const j_compress_ptr cinfo, J_INT_PARAM param)
 {
   switch (param) {
   case JINT_COMPRESS_PROFILE:
diff --git a/jctrans.c b/jctrans.c
index f4be9f3b..c0907849 100644
--- a/jctrans.c
+++ b/jctrans.c
@@ -69,7 +69,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)
  */
 
 GLOBAL(void)
-jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
+jpeg_copy_critical_parameters (const j_decompress_ptr srcinfo,
                                j_compress_ptr dstinfo)
 {
   JQUANT_TBL ** qtblptr;
diff --git a/jdapimin.c b/jdapimin.c
index fc8898f8..e1327077 100644
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -333,7 +333,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
  */
 
 GLOBAL(boolean)
-jpeg_input_complete (j_decompress_ptr cinfo)
+jpeg_input_complete (const j_decompress_ptr cinfo)
 {
   /* Check for valid jpeg object */
   if (cinfo->global_state < DSTATE_START ||
@@ -348,7 +348,7 @@ jpeg_input_complete (j_decompress_ptr cinfo)
  */
 
 GLOBAL(boolean)
-jpeg_has_multiple_scans (j_decompress_ptr cinfo)
+jpeg_has_multiple_scans (const j_decompress_ptr cinfo)
 {
   /* Only valid after jpeg_read_header completes */
   if (cinfo->global_state < DSTATE_READY ||
diff --git a/jpeglib.h b/jpeglib.h
index c1bff3bf..1ebced88 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -1036,10 +1036,10 @@ EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
                                        JDIMENSION max_lines);
 
 /* Additional entry points for buffered-image mode. */
-EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_has_multiple_scans (const j_decompress_ptr cinfo);
 EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
 EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete (const j_decompress_ptr cinfo);
 EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
 EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
 /* Return value is one of: */
@@ -1068,7 +1068,7 @@ EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
 EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
 EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
                                       jvirt_barray_ptr * coef_arrays);
-EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
+EXTERN(void) jpeg_copy_critical_parameters (const j_decompress_ptr srcinfo,
                                             j_compress_ptr dstinfo);
 
 /* If you choose to abort compression or decompression before completing
@@ -1090,21 +1090,21 @@ EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
 EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
 
 /* Accessor functions for extension parameters */
-EXTERN(boolean) jpeg_c_bool_param_supported (j_compress_ptr cinfo,
+EXTERN(boolean) jpeg_c_bool_param_supported (const j_compress_ptr cinfo,
                                              J_BOOLEAN_PARAM param);
 EXTERN(void) jpeg_c_set_bool_param (j_compress_ptr cinfo,
                                     J_BOOLEAN_PARAM param, boolean value);
 EXTERN(boolean) jpeg_c_get_bool_param (j_compress_ptr cinfo,
                                        J_BOOLEAN_PARAM param);
 
-EXTERN(boolean) jpeg_c_float_param_supported (j_compress_ptr cinfo,
+EXTERN(boolean) jpeg_c_float_param_supported (const j_compress_ptr cinfo,
                                               J_FLOAT_PARAM param);
 EXTERN(void) jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param,
                                      float value);
 EXTERN(float) jpeg_c_get_float_param (j_compress_ptr cinfo,
                                       J_FLOAT_PARAM param);
 
-EXTERN(boolean) jpeg_c_int_param_supported (j_compress_ptr cinfo,
+EXTERN(boolean) jpeg_c_int_param_supported (const j_compress_ptr cinfo,
                                             J_INT_PARAM param);
 EXTERN(void) jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param,
                                    int value);

From 4966e1eec546395cca7541f5c28b5d086f8ed681 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kornel=20Lesin=CC=81ski?= <kornel@geekhood.net>
Date: Sat, 24 Jan 2015 16:27:57 +0000
Subject: [PATCH 096/108] Const on getters

---
 jcext.c   | 6 +++---
 jpeglib.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/jcext.c b/jcext.c
index 1a7da407..a23a957e 100644
--- a/jcext.c
+++ b/jcext.c
@@ -70,7 +70,7 @@ jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param,
 
 
 GLOBAL(boolean)
-jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
+jpeg_c_get_bool_param (const j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
 {
   switch(param) {
   case JBOOLEAN_OPTIMIZE_SCANS:
@@ -131,7 +131,7 @@ jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param, float value)
 
 
 GLOBAL(float)
-jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param)
+jpeg_c_get_float_param (const j_compress_ptr cinfo, J_FLOAT_PARAM param)
 {
   switch (param) {
   case JFLOAT_LAMBDA_LOG_SCALE1:
@@ -198,7 +198,7 @@ jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value)
 
 
 GLOBAL(int)
-jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
+jpeg_c_get_int_param (const j_compress_ptr cinfo, J_INT_PARAM param)
 {
   switch (param) {
   case JINT_COMPRESS_PROFILE:
diff --git a/jpeglib.h b/jpeglib.h
index 1ebced88..48f9c3b2 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -1094,21 +1094,21 @@ EXTERN(boolean) jpeg_c_bool_param_supported (const j_compress_ptr cinfo,
                                              J_BOOLEAN_PARAM param);
 EXTERN(void) jpeg_c_set_bool_param (j_compress_ptr cinfo,
                                     J_BOOLEAN_PARAM param, boolean value);
-EXTERN(boolean) jpeg_c_get_bool_param (j_compress_ptr cinfo,
+EXTERN(boolean) jpeg_c_get_bool_param (const j_compress_ptr cinfo,
                                        J_BOOLEAN_PARAM param);
 
 EXTERN(boolean) jpeg_c_float_param_supported (const j_compress_ptr cinfo,
                                               J_FLOAT_PARAM param);
 EXTERN(void) jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param,
                                      float value);
-EXTERN(float) jpeg_c_get_float_param (j_compress_ptr cinfo,
+EXTERN(float) jpeg_c_get_float_param (const j_compress_ptr cinfo,
                                       J_FLOAT_PARAM param);
 
 EXTERN(boolean) jpeg_c_int_param_supported (const j_compress_ptr cinfo,
                                             J_INT_PARAM param);
 EXTERN(void) jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param,
                                    int value);
-EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
+EXTERN(int) jpeg_c_get_int_param (const j_compress_ptr cinfo, J_INT_PARAM param);
 
 
 /* These marker codes are exported since applications and data source modules

From 02939f53a0557a98335dc415d797d463e508a393 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Thu, 12 Feb 2015 12:50:06 +0100
Subject: [PATCH 097/108] Fix overflow issue #157

Clip DCT coefficients when deringing preprocessing is used, because they
can otherwise overflow.
---
 jcdctmgr.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/jcdctmgr.c b/jcdctmgr.c
index 17127b0f..f601ebd6 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -22,6 +22,7 @@
 #include "jpeglib.h"
 #include "jdct.h"               /* Private declarations for DCT subsystem */
 #include "jsimddct.h"
+#include "jchuff.h"
 #include <assert.h>
 #include <math.h>
 
@@ -726,6 +727,17 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
     
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
     (*do_quantize) (coef_blocks[bi], divisors, workspace);
+
+    if (do_preprocess) {
+      int i;
+      int maxval = (1 << MAX_COEF_BITS) - 1;
+      for (i = 0; i < 64; i++) {
+        if (coef_blocks[bi][i] < -maxval)
+          coef_blocks[bi][i] = -maxval;
+        if (coef_blocks[bi][i] > maxval)
+          coef_blocks[bi][i] = maxval;
+      }
+    }
   }
 }
 
@@ -845,12 +857,22 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
     (*do_quantize) (coef_blocks[bi], divisors, workspace);
+    
+    if (do_preprocess) {
+      int i;
+      int maxval = (1 << MAX_COEF_BITS) - 1;
+      for (i = 0; i < 64; i++) {
+        if (coef_blocks[bi][i] < -maxval)
+          coef_blocks[bi][i] = -maxval;
+        if (coef_blocks[bi][i] > maxval)
+          coef_blocks[bi][i] = maxval;
+      }
+    }
   }
 }
 
 #endif /* DCT_FLOAT_SUPPORTED */
 
-#include "jchuff.h"
 #include "jpeg_nbits_table.h"
 
 static const float jpeg_lambda_weights_flat[64] = {
@@ -997,6 +1019,11 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
         int bits;
 
         dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
+        if (dc_candidate[k][bi] >= (1<<MAX_COEF_BITS))
+          dc_candidate[k][bi] = (1<<MAX_COEF_BITS)-1;
+        if (dc_candidate[k][bi] <= -(1<<MAX_COEF_BITS))
+          dc_candidate[k][bi] = -(1<<MAX_COEF_BITS)+1;
+
         delta = dc_candidate[k][bi] * q - x;
         dc_candidate_dist = delta * delta * lambda_dc;
         dc_candidate[k][bi] *= 1 + 2*sign;
@@ -1076,6 +1103,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
         continue;
       }
 
+      if (qval >= (1<<MAX_COEF_BITS))
+        qval = (1<<MAX_COEF_BITS)-1;
+      
       num_candidates = jpeg_nbits_table[qval];
       for (k = 0; k < num_candidates; k++) {
         int delta;

From fbf0a5fbc5bc9f37b30639494b4eaa442d178a9d Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Thu, 12 Feb 2015 20:01:38 +0100
Subject: [PATCH 098/108] Larger number of DC trellis candidates

See #147.
The provided patch could not be merged, so it was rewritten. The change is
also applied to quantize_trellis_arith().
---
 jcdctmgr.c | 50 +++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/jcdctmgr.c b/jcdctmgr.c
index f601ebd6..0b82e8d3 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -897,7 +897,12 @@ static const float jpeg_lambda_weights_csf_luma[64] = {
   0.43454f, 0.42146f, 0.34609f, 0.24072f, 0.15975f, 0.10701f, 0.07558f, 0.05875f,
 };
 
-#define DC_TRELLIS_CANDIDATES 3
+#define DC_TRELLIS_MAX_CANDIDATES 9
+
+LOCAL(int) get_num_dc_trellis_candidates(int dc_quantval) {
+  /* Higher qualities can tolerate higher DC distortion */
+  return MIN(DC_TRELLIS_MAX_CANDIDATES, (2 + 60 / dc_quantval)|1);
+}
 
 GLOBAL(void)
 quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
@@ -930,12 +935,13 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
   int zero_run;
   int run_bits;
   int rate;
-  float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
-  int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
-  JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
+  float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
+  int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
+  JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
   int mode = 1;
   float lambda_table[DCTSIZE2];
-
+  const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);
+  
   Ss = cinfo->Ss;
   Se = cinfo->Se;
   if (Ss == 0)
@@ -958,8 +964,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     accumulated_block_cost[0] = 0;
     requires_eob[0] = 0;
   }
+  
   if (cinfo->master->trellis_quant_dc) {
-    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
+    for (i = 0; i < dc_trellis_candidates; i++) {
       accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
       dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
       dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
@@ -1013,12 +1020,12 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
       float dc_candidate_dist;
 
       qval = (x + q/2) / q; /* quantized value (round nearest) */
-      for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
+      for (k = 0; k < dc_trellis_candidates; k++) {
         int delta;
         int dc_delta;
         int bits;
 
-        dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
+        dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
         if (dc_candidate[k][bi] >= (1<<MAX_COEF_BITS))
           dc_candidate[k][bi] = (1<<MAX_COEF_BITS)-1;
         if (dc_candidate[k][bi] <= -(1<<MAX_COEF_BITS))
@@ -1060,7 +1067,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
           accumulated_dc_cost[k][0] = cost;
           dc_cost_backtrack[k][0] = -1;
         } else {
-          for (l = 0; l < DC_TRELLIS_CANDIDATES; l++) {
+          for (l = 0; l < dc_trellis_candidates; l++) {
             dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
 
             /* Derive number of suffix bits */
@@ -1270,7 +1277,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
   
   if (cinfo->master->trellis_quant_dc) {
     j = 0;
-    for (i = 1; i < DC_TRELLIS_CANDIDATES; i++) {
+    for (i = 1; i < dc_trellis_candidates; i++) {
       if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
         j = i;
     }
@@ -1282,7 +1289,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
     /* Save DC predictor */
     *last_dc_val = coef_blocks[num_blocks-1][0];
 
-    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
+    for (i = 0; i < dc_trellis_candidates; i++) {
       free(accumulated_dc_cost[i]);
       free(dc_cost_backtrack[i]);
       free(dc_candidate[i]);
@@ -1315,13 +1322,14 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
   float cost;
   float run_bits;
   int rate;
-  float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
-  int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
-  JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
-  int *dc_context[DC_TRELLIS_CANDIDATES];
+  float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
+  int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
+  JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
+  int *dc_context[DC_TRELLIS_MAX_CANDIDATES];
   
   int mode = 1;
   float lambda_table[DCTSIZE2];
+  const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);
   
   Ss = cinfo->Ss;
   Se = cinfo->Se;
@@ -1331,7 +1339,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
     return;
   
   if (cinfo->master->trellis_quant_dc) {
-    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
+    for (i = 0; i < dc_trellis_candidates; i++) {
       accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
       dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
       dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
@@ -1389,14 +1397,14 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
       qval = (x + q/2) / q; /* quantized value (round nearest) */
       
       /* loop over candidates in current block */
-      for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
+      for (k = 0; k < dc_trellis_candidates; k++) {
         int delta;
         int dc_delta;
         float bits;
         int m;
         int v2;
         
-        dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
+        dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
         delta = dc_candidate[k][bi] * q - x;
         dc_candidate_dist = delta * delta * lambda_dc;
         dc_candidate[k][bi] *= 1 + 2*sign;
@@ -1420,7 +1428,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
         }
         
         /* loop of candidates from previous block */
-        for (l = 0; l < (bi == 0 ? 1 : DC_TRELLIS_CANDIDATES); l++) {
+        for (l = 0; l < (bi == 0 ? 1 : dc_trellis_candidates); l++) {
           int dc_pred = (bi == 0 ? *last_dc_val : dc_candidate[l][bi-1]);
           int updated_dc_context = 0;
           int st = (bi == 0) ? 0 : dc_context[l][bi-1];
@@ -1606,7 +1614,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
   
   if (cinfo->master->trellis_quant_dc) {
     j = 0;
-    for (i = 1; i < DC_TRELLIS_CANDIDATES; i++) {
+    for (i = 1; i < dc_trellis_candidates; i++) {
       if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
         j = i;
     }
@@ -1618,7 +1626,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
     /* Save DC predictor */
     *last_dc_val = coef_blocks[num_blocks-1][0];
     
-    for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
+    for (i = 0; i < dc_trellis_candidates; i++) {
       free(accumulated_dc_cost[i]);
       free(dc_cost_backtrack[i]);
       free(dc_candidate[i]);

From 1da5cf4251951bd2246038270c6249a6c8f5ec97 Mon Sep 17 00:00:00 2001
From: Tristan Matthews <tmatth@videolan.org>
Date: Fri, 13 Mar 2015 13:23:45 -0500
Subject: [PATCH 099/108] rdpng: convert 16-bit input to 8-bit

Strip 16-bit samples down to 8 bits instead of assuming the input is
already 8-bit, which produced corrupted images.
---
 rdpng.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/rdpng.c b/rdpng.c
index fef8aee5..34ac3081 100644
--- a/rdpng.c
+++ b/rdpng.c
@@ -76,6 +76,10 @@ start_input_png (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
         cinfo->in_color_space = JCS_RGB;
         cinfo->input_components = 3;
     }
+
+    if (bit_depth == 16)
+        png_set_strip_16(source->png_ptr);
+
     cinfo->data_precision = 8;
     cinfo->image_width = width;
     cinfo->image_height = height;

From cc11b90b227bdd0ac8d1553a011ef729a6e7ea6d Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Mon, 23 Mar 2015 13:33:14 -0500
Subject: [PATCH 100/108] Fix #153

Make sure BMP height and width don't exceed positive signed 32-bit
range even when 64-bit variables are being used.
---
 rdbmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rdbmp.c b/rdbmp.c
index ffedeed4..cf359e76 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -381,7 +381,7 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
     return;
   }
 
-  if (biWidth <= 0 || biHeight <= 0)
+  if (biWidth <= 0 || biHeight <= 0 || biWidth > 0x7fffffffL || biHeight > 0x7fffffffL)
     ERREXIT(cinfo, JERR_BMP_EMPTY);
   if (biPlanes != 1)
     ERREXIT(cinfo, JERR_BMP_BADPLANES);

From f8a5b80cb001b478e73767dae7d08e609ed0982b Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Mon, 23 Mar 2015 14:05:13 -0500
Subject: [PATCH 101/108] cjpeg option for baseline quant tables

Add command line option -quant-baseline to cjpeg to force quantization
table entries to be in the 1-255 range for JPEG baseline compatibility.
See related discussion in #145.
---
 cjpeg.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cjpeg.c b/cjpeg.c
index 5695e7d0..bb73ee98 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -207,6 +207,7 @@ usage (void)
   fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
           (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
 #endif
+  fprintf(stderr, "  -quant-baseline Use 8-bit quantization table entries for baseline JPEG compatibility\n");
   fprintf(stderr, "  -quant-table N Use predefined quantization table N:\n");
   fprintf(stderr, "                 - 0 JPEG Annex K\n");
   fprintf(stderr, "                 - 1 Flat\n");
@@ -450,7 +451,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
-    } else if (keymatch(arg, "quant-table", 2)) {
+    } else if (keymatch(arg, "quant-table", 7)) {
       int val;
       if (++argn >= argc)       /* advance to next argument */
         usage();
@@ -461,7 +462,11 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
         usage();
       }
       jpeg_set_quality(cinfo, 75, TRUE);
-      
+
+    } else if (keymatch(arg, "quant-baseline", 7)) {
+      /* Force quantization table to meet baseline requirements */
+      force_baseline = TRUE;
+    
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;

From 619c20d5e841807fb7b4a32beaca1786c9d834cc Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Thu, 2 Apr 2015 17:45:01 -0500
Subject: [PATCH 102/108] Scan optimization: return error when unable to copy
 data buffer

#166 describes an issue where I/O suspension is not properly handled in
scan optimization. Supporting I/O suspension may be difficult to
achieve here, thus return an error to make it explicit that I/O
suspension is unsupported.
---
 cderror.h  | 2 ++
 jcmaster.c | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/cderror.h b/cderror.h
index 77b8eb64..e7ac9d99 100644
--- a/cderror.h
+++ b/cderror.h
@@ -127,6 +127,8 @@ JMESSAGE(JERR_UNSUPPORTED_FORMAT, "Unsupported output file format")
 JMESSAGE(JERR_PNG_ERROR, "Unable to read PNG file: %s")
 #endif
 
+JMESSAGE(JERR_UNSUPPORTED_SUSPEND, "I/O suspension not supported in scan optimization")
+
 #ifdef JMAKE_ENUM_LIST
 
   JMSG_LASTADDONCODE
diff --git a/jcmaster.c b/jcmaster.c
index bf5bf2f5..938e058b 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -605,7 +605,9 @@ copy_buffer (j_compress_ptr cinfo, int scan_idx)
     size -= cinfo->dest->free_in_buffer;
     cinfo->dest->next_output_byte += cinfo->dest->free_in_buffer;
     cinfo->dest->free_in_buffer = 0;
-    (*cinfo->dest->empty_output_buffer)(cinfo);
+    
+    if (!(*cinfo->dest->empty_output_buffer)(cinfo))
+      ERREXIT(cinfo, JERR_UNSUPPORTED_SUSPEND);
   }
 
   MEMCOPY(cinfo->dest->next_output_byte, src, size);

From c716918d0ac7786f846479e8fd9b13047f0c3904 Mon Sep 17 00:00:00 2001
From: Frank Bossen <frank@bossentech.com>
Date: Wed, 8 Apr 2015 01:36:02 +0200
Subject: [PATCH 103/108] Fix previous commit

Define JERR_UNSUPPORTED_SUSPEND in the correct header file (jerror.h
instead of cderror.h).
---
 cderror.h | 2 --
 jerror.h  | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cderror.h b/cderror.h
index e7ac9d99..77b8eb64 100644
--- a/cderror.h
+++ b/cderror.h
@@ -127,8 +127,6 @@ JMESSAGE(JERR_UNSUPPORTED_FORMAT, "Unsupported output file format")
 JMESSAGE(JERR_PNG_ERROR, "Unable to read PNG file: %s")
 #endif
 
-JMESSAGE(JERR_UNSUPPORTED_SUSPEND, "I/O suspension not supported in scan optimization")
-
 #ifdef JMAKE_ENUM_LIST
 
   JMSG_LASTADDONCODE
diff --git a/jerror.h b/jerror.h
index cb5d6f84..e2d33875 100644
--- a/jerror.h
+++ b/jerror.h
@@ -210,6 +210,8 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 JMESSAGE(JERR_BAD_PARAM, "Bogus parameter")
 JMESSAGE(JERR_BAD_PARAM_VALUE, "Bogus parameter value")
 
+JMESSAGE(JERR_UNSUPPORTED_SUSPEND, "I/O suspension not supported in scan optimization")  
+
 #ifdef JMAKE_ENUM_LIST
 
   JMSG_LASTMSGCODE

From 6b05623682be5bcc93aa6c3366e20118ff41a12b Mon Sep 17 00:00:00 2001
From: Vittorio Giovara <vittorio.giovara@gmail.com>
Date: Thu, 16 Apr 2015 16:30:34 +0100
Subject: [PATCH 104/108] jpegtran: Do not leak the input and output buffers

Signed-off-by: Vittorio Giovara <vittorio.giovara@gmail.com>
---
 jpegtran.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/jpegtran.c b/jpegtran.c
index ebda3cfd..f40c3ccf 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -624,6 +624,9 @@ main (int argc, char **argv)
   end_progress_monitor((j_common_ptr) &dstinfo);
 #endif
 
+  free(inbuffer);
+  free(outbuffer);
+
   /* All done. */
   exit(jsrcerr.num_warnings + jdsterr.num_warnings ?EXIT_WARNING:EXIT_SUCCESS);
   return 0;                     /* suppress no-return-value warnings */

From 6c1538470fcd17e818739e26f9b887c9f0b34793 Mon Sep 17 00:00:00 2001
From: Arjun Sreedharan <arjun024@gmail.com>
Date: Sat, 25 Apr 2015 02:17:45 +0530
Subject: [PATCH 105/108] yuvjpeg: fix memory leak when @image_buffer
 allocation fails

Make sure @yuv_buffer is freed before return.

Signed-off-by: Arjun Sreedharan <arjun024@gmail.com>
---
 yuvjpeg.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/yuvjpeg.c b/yuvjpeg.c
index 11437941..b21683c5 100644
--- a/yuvjpeg.c
+++ b/yuvjpeg.c
@@ -196,6 +196,7 @@ int main(int argc, char *argv[]) {
   image_buffer =
    malloc(frame_width*frame_height + 2*(frame_width/2)*(frame_height/2));
   if (!image_buffer) {
+    free(yuv_buffer);
     fprintf(stderr, "Memory allocation failure!\n");
     return 1;
   }

From 7e8ed0d448e44ebac7dc7b4963c36209924a0b8b Mon Sep 17 00:00:00 2001
From: Tristan Matthews <tmatth@videolan.org>
Date: Sun, 3 May 2015 21:35:39 -0400
Subject: [PATCH 106/108] jpegyuv: fix memory leak when @image_buffer
 allocation fails

Make sure @yuv_buffer is freed before return.
---
 jpegyuv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/jpegyuv.c b/jpegyuv.c
index 86c737d8..85c2360c 100644
--- a/jpegyuv.c
+++ b/jpegyuv.c
@@ -112,6 +112,7 @@ int main(int argc, char *argv[]) {
 
   image_buffer = malloc(frame_width*16 + 2*(frame_width/2)*8);
   if (!image_buffer) {
+    free(yuv_buffer);
     fprintf(stderr, "Memory allocation failure!\n");
     return 1;
   }

From ee39375c85d3968431946713d3edc481d191ff93 Mon Sep 17 00:00:00 2001
From: Tristan Matthews <tmatth@videolan.org>
Date: Mon, 4 May 2015 09:44:11 -0400
Subject: [PATCH 107/108] jpegyuv: fix memory leak when path is invalid

---
 jpegyuv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/jpegyuv.c b/jpegyuv.c
index 85c2360c..9375568c 100644
--- a/jpegyuv.c
+++ b/jpegyuv.c
@@ -164,6 +164,7 @@ int main(int argc, char *argv[]) {
   yuv_fd = fopen(yuv_path, "wb");
   if (!yuv_fd) {
     fprintf(stderr, "Invalid path to YUV file!");
+    free(yuv_buffer);
     return 1;
   }
   if (fwrite(yuv_buffer, yuv_size, 1, yuv_fd) != 1) {

From 5198654f739552ed24c7f014574d1e74ee9ef8ac Mon Sep 17 00:00:00 2001
From: Josh Aas <joshmoz@fastmail.com>
Date: Mon, 18 May 2015 09:28:09 -0500
Subject: [PATCH 108/108] Bump version number to 3.1.

---
 CMakeLists.txt | 2 +-
 configure.ac   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f2e5f52..afaf9a49 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@ if(POLICY CMP0022)
 endif()
 
 project(mozjpeg C)
-set(VERSION 3.0)
+set(VERSION 3.1)
 
 if(CYGWIN OR NOT CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
   execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
diff --git a/configure.ac b/configure.ac
index 4d6f3932..a2b4618f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([mozjpeg], [3.0])
+AC_INIT([mozjpeg], [3.1])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])