Merge branch 'trellis'

2014-05-09 18:28:26 -04:00
parent b36bec4d71 87254c1c44
commit 93da07241d
11 changed files with 664 additions and 34 deletions
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -170,6 +170,12 @@ usage (void)
 #endif
  fprintf(stderr, "  -revert        Revert to standard defaults (instead of mozjpeg defaults)\n");
  fprintf(stderr, "  -fastcrush     Disable progressive scan optimization\n");
+  fprintf(stderr, "  -multidcscan   Use multiple DC scans (may be incompatible with some JPEG decoders)\n");
+  fprintf(stderr, "  -notrellis     Disable trellis optimization\n");
+  fprintf(stderr, "  -tune-psnr     Tune trellis optimization for PSNR\n");
+  fprintf(stderr, "  -tune-hvs-psnr Tune trellis optimization for PSNR-HVS (default)\n");
+  fprintf(stderr, "  -tune-ssim     Tune trellis optimization for SSIM\n");
+  fprintf(stderr, "  -tune-ms-ssim  Tune trellis optimization for MS-SSIM\n");
  fprintf(stderr, "Switches for advanced users:\n");
 #ifdef C_ARITH_CODING_SUPPORTED
  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
@@ -302,6 +308,10 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
    } else if (keymatch(arg, "fastcrush", 4)) {
      cinfo->optimize_scans = FALSE;

+    } else if (keymatch(arg, "flat", 4)) {
+      cinfo->use_flat_quant_tbl = TRUE;
+      jpeg_set_quality(cinfo, 75, TRUE);
+      
    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
      /* Force a monochrome JPEG file to be generated. */
      jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
@@ -310,6 +320,16 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
      /* Force an RGB JPEG file to be generated. */
      jpeg_set_colorspace(cinfo, JCS_RGB);

+    } else if (keymatch(arg, "lambda1", 7)) {
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      cinfo->lambda_log_scale1 = atof(argv[argn]);
+      
+    } else if (keymatch(arg, "lambda2", 7)) {
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      cinfo->lambda_log_scale2 = atof(argv[argn]);
+      
    } else if (keymatch(arg, "maxmemory", 3)) {
      /* Maximum memory in Kb (or Mb with 'm'). */
      long lval;
@@ -323,6 +343,9 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 	lval *= 1000L;
      cinfo->mem->max_memory_to_use = lval * 1000L;

+    } else if (keymatch(arg, "multidcscan", 3)) {
+      cinfo->one_dc_scan = FALSE;
+      
    } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
      /* Enable entropy parm optimization. */
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -446,6 +469,38 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
      /* Input file is Targa format. */
      is_targa = TRUE;

+    } else if (keymatch(arg, "notrellis", 1)) {
+      /* disable trellis quantization */
+      cinfo->trellis_quant = FALSE;
+      
+    } else if (keymatch(arg, "tune-psnr", 6)) {
+      cinfo->use_flat_quant_tbl = TRUE;
+      cinfo->lambda_log_scale1 = 9.0;
+      cinfo->lambda_log_scale2 = 0.0;
+      cinfo->use_lambda_weight_tbl = FALSE;
+      jpeg_set_quality(cinfo, 75, TRUE);
+      
+    } else if (keymatch(arg, "tune-ssim", 6)) {
+      cinfo->use_flat_quant_tbl = TRUE;
+      cinfo->lambda_log_scale1 = 12.0;
+      cinfo->lambda_log_scale2 = 13.5;
+      cinfo->use_lambda_weight_tbl = FALSE;
+      jpeg_set_quality(cinfo, 75, TRUE);
+      
+    } else if (keymatch(arg, "tune-ms-ssim", 6)) {
+      cinfo->use_flat_quant_tbl = TRUE;
+      cinfo->lambda_log_scale1 = 10.5;
+      cinfo->lambda_log_scale2 = 13.0;
+      cinfo->use_lambda_weight_tbl = TRUE;
+      jpeg_set_quality(cinfo, 75, TRUE);
+      
+    } else if (keymatch(arg, "tune-hvs-psnr", 6)) {
+      cinfo->use_flat_quant_tbl = FALSE;
+      cinfo->lambda_log_scale1 = 16.0;
+      cinfo->lambda_log_scale2 = 15.5;
+      cinfo->use_lambda_weight_tbl = TRUE;
+      jpeg_set_quality(cinfo, 75, TRUE);
+      
    } else {
      usage();			/* bogus switch */
    }
--- a/configure.ac
+++ b/configure.ac
@@ -93,6 +93,7 @@ if test "x${SUNCC}" = "xyes"; then
 fi

 # Checks for libraries.
+AC_CHECK_LIB([m],[pow])

 # Checks for header files.
 AC_HEADER_STDC
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -2,6 +2,8 @@
 * jccoefct.c
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -13,7 +15,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-
+#include "jchuff.h"

 /* We use a full-image coefficient buffer when doing Huffman optimization,
 * and also for writing multiple-scan JPEG files.  In all cases, the DCT
@@ -52,6 +54,10 @@ typedef struct {

  /* In multi-pass modes, we need a virtual block array for each component. */
  jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+
+  /* when using trellis quantization, need to keep a copy of all unquantized coefficients */
+  jvirt_barray_ptr whole_image_uq[MAX_COMPONENTS];
+
 } my_coef_controller;

 typedef my_coef_controller * my_coef_ptr;
@@ -66,6 +72,8 @@ METHODDEF(boolean) compress_first_pass
 METHODDEF(boolean) compress_output
    JPP((j_compress_ptr cinfo, JSAMPIMAGE input_buf));
 #endif
+METHODDEF(boolean) compress_trellis_pass
+    JPP((j_compress_ptr cinfo, JSAMPIMAGE input_buf));


 LOCAL(void)
@@ -122,6 +130,12 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
    coef->pub.compress_data = compress_output;
    break;
 #endif
+  case JBUF_REQUANT:
+    if (coef->whole_image[0] == NULL)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    coef->pub.compress_data = compress_trellis_pass;
+    break;
+      
  default:
    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
    break;
@@ -177,7 +191,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 	    (*cinfo->fdct->forward_DCT) (cinfo, compptr,
 					 input_buf[compptr->component_index],
 					 coef->MCU_buffer[blkn],
-					 ypos, xpos, (JDIMENSION) blockcnt);
+					 ypos, xpos, (JDIMENSION) blockcnt, NULL);
 	    if (blockcnt < compptr->MCU_width) {
 	      /* Create some dummy blocks at the right edge of the image. */
 	      jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt],
@@ -252,6 +266,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  jpeg_component_info *compptr;
  JBLOCKARRAY buffer;
  JBLOCKROW thisblockrow, lastblockrow;
+  JBLOCKARRAY buffer_dst;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
@@ -260,6 +275,12 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      ((j_common_ptr) cinfo, coef->whole_image[ci],
       coef->iMCU_row_num * compptr->v_samp_factor,
       (JDIMENSION) compptr->v_samp_factor, TRUE);
+    
+    buffer_dst = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr) cinfo, coef->whole_image_uq[ci],
+       coef->iMCU_row_num * compptr->v_samp_factor,
+       (JDIMENSION) compptr->v_samp_factor, TRUE);
+    
    /* Count non-dummy DCT block rows in this iMCU row. */
    if (coef->iMCU_row_num < last_iMCU_row)
      block_rows = compptr->v_samp_factor;
@@ -282,7 +303,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      (*cinfo->fdct->forward_DCT) (cinfo, compptr,
 				   input_buf[ci], thisblockrow,
 				   (JDIMENSION) (block_row * DCTSIZE),
-				   (JDIMENSION) 0, blocks_across);
+				   (JDIMENSION) 0, blocks_across, buffer_dst[block_row]);
      if (ndummy > 0) {
 	/* Create dummy blocks at the right edge of the image. */
 	thisblockrow += blocks_across; /* => first dummy block */
@@ -326,6 +347,101 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  return compress_output(cinfo, input_buf);
 }

+METHODDEF(boolean)
+compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+{
+  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+  JDIMENSION blocks_across, MCUs_across, MCUindex;
+  int bi, ci, h_samp_factor, block_row, block_rows, ndummy;
+  JCOEF lastDC;
+  jpeg_component_info *compptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW thisblockrow, lastblockrow;
+  JBLOCKARRAY buffer_dst;
+  /* Trellis pass: re-quantize the stored coefficients of each scan component for this iMCU row, then hand off to compress_output. */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    c_derived_tbl actbl_data;
+    c_derived_tbl *actbl = &actbl_data;
+    compptr = cinfo->cur_comp_info[ci];
+    /* Build the derived table for this component's ac_tbl_no; quantize_trellis reads its code lengths (ehufsi). */
+    jpeg_make_c_derived_tbl(cinfo, FALSE, compptr->ac_tbl_no, &actbl);
+    
+    /* Align the virtual buffer for this component. */
+    buffer = (*cinfo->mem->access_virt_barray)
+    ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+     coef->iMCU_row_num * compptr->v_samp_factor,
+     (JDIMENSION) compptr->v_samp_factor, TRUE);
+    /* Despite its name, buffer_dst is the input here: whole_image_uq rows are passed as the src argument of quantize_trellis. */
+    buffer_dst = (*cinfo->mem->access_virt_barray)
+    ((j_common_ptr) cinfo, coef->whole_image_uq[compptr->component_index],
+     coef->iMCU_row_num * compptr->v_samp_factor,
+     (JDIMENSION) compptr->v_samp_factor, TRUE);
+    
+    /* Count non-dummy DCT block rows in this iMCU row. */
+    if (coef->iMCU_row_num < last_iMCU_row)
+      block_rows = compptr->v_samp_factor;
+    else {
+      /* NB: can't use last_row_height here, since may not be set! */
+      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      if (block_rows == 0) block_rows = compptr->v_samp_factor;
+    }
+    blocks_across = compptr->width_in_blocks;
+    h_samp_factor = compptr->h_samp_factor;
+    /* Count number of dummy blocks to be added at the right margin. */
+    ndummy = (int) (blocks_across % h_samp_factor);
+    if (ndummy > 0)
+      ndummy = h_samp_factor - ndummy;
+    /* Run trellis quantization on all non-dummy blocks in this iMCU row.
+     * Each call on quantize_trellis processes a complete horizontal row of
+     * DCT blocks (no DCT is performed in this pass).
+     */
+    for (block_row = 0; block_row < block_rows; block_row++) {
+      thisblockrow = buffer[block_row];
+      quantize_trellis(cinfo, actbl, thisblockrow, buffer_dst[block_row], blocks_across, cinfo->quant_tbl_ptrs[compptr->quant_tbl_no], cinfo->norm_src[compptr->quant_tbl_no], cinfo->norm_coef[compptr->quant_tbl_no]);
+      
+      if (ndummy > 0) {
+	/* Create dummy blocks at the right edge of the image. */
+	thisblockrow += blocks_across; /* => first dummy block */
+	jzero_far((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK));
+	lastDC = thisblockrow[-1][0];
+	for (bi = 0; bi < ndummy; bi++) {
+	  thisblockrow[bi][0] = lastDC;
+	}
+      }
+    }
+    /* If at end of image, create dummy block rows as needed.
+     * The tricky part here is that within each MCU, we want the DC values
+     * of the dummy blocks to match the last real block's DC value.
+     * This squeezes a few more bytes out of the resulting file...
+     */
+    if (coef->iMCU_row_num == last_iMCU_row) {
+      blocks_across += ndummy;	/* include lower right corner */
+      MCUs_across = blocks_across / h_samp_factor;
+      for (block_row = block_rows; block_row < compptr->v_samp_factor;
+	   block_row++) {
+	thisblockrow = buffer[block_row];
+	lastblockrow = buffer[block_row-1];
+	jzero_far((void FAR *) thisblockrow,
+		  (size_t) (blocks_across * SIZEOF(JBLOCK)));
+	for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
+	  lastDC = lastblockrow[h_samp_factor-1][0];
+	  for (bi = 0; bi < h_samp_factor; bi++) {
+	    thisblockrow[bi][0] = lastDC;
+	  }
+	  thisblockrow += h_samp_factor; /* advance to next MCU in row */
+	  lastblockrow += h_samp_factor;
+	}
+      }
+    }
+  }
+
+  /* NB: compress_output will increment iMCU_row_num if successful.
+   * A suspension return will result in redoing all the work above next time.
+   */
+  
+  /* Emit data to the entropy encoder, sharing code with subsequent passes */
+  return compress_output(cinfo, input_buf);
+}

 /*
 * Process some data in subsequent passes of a multi-pass case.
@@ -377,6 +493,7 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 	  }
 	}
      }
+
      /* Try to write the MCU. */
      if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
 	/* Suspension forced; update state counters and exit */
@@ -429,6 +546,14 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 	 (JDIMENSION) jround_up((long) compptr->height_in_blocks,
 				(long) compptr->v_samp_factor),
 	 (JDIMENSION) compptr->v_samp_factor);
+      
+      coef->whole_image_uq[ci] = (*cinfo->mem->request_virt_barray)
+        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
+                                (long) compptr->h_samp_factor),
+         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
+                                (long) compptr->v_samp_factor),
+         (JDIMENSION) compptr->v_samp_factor);
    }
 #else
    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -7,6 +7,8 @@
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2011 D. R. Commander
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains the forward-DCT management logic.
@@ -20,7 +22,8 @@
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
 #include "jsimddct.h"
-
+#include <assert.h>
+#include <math.h>

 /* Private subobject for this module */

@@ -412,7 +415,7 @@ METHODDEF(void)
 forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 	     JDIMENSION start_row, JDIMENSION start_col,
-	     JDIMENSION num_blocks)
+	     JDIMENSION num_blocks, JBLOCKROW dst)
 /* This version is used for integer DCT implementations. */
 {
  /* This routine is heavily used, so it's worth coding it tightly. */
@@ -436,6 +439,16 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
    /* Perform the DCT */
    (*do_dct) (workspace);

+    /* Save unquantized transform coefficients for later trellis quantization */
+    if (dst) {
+      int i;
+      for (i = 0; i < DCTSIZE2; i++) {
+        dst[bi][i] = workspace[i];
+        //printf("d%d ", workspace[i]);
+      }
+      //printf("\n");
+    }
+    
    /* Quantize/descale the coefficients, and store into coef_blocks[] */
    (*do_quantize) (coef_blocks[bi], divisors, workspace);
  }
@@ -502,7 +515,7 @@ METHODDEF(void)
 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 		   JDIMENSION start_row, JDIMENSION start_col,
-		   JDIMENSION num_blocks)
+		   JDIMENSION num_blocks, JBLOCKROW dst)
 /* This version is used for floating-point DCT implementations. */
 {
  /* This routine is heavily used, so it's worth coding it tightly. */
@@ -534,6 +547,290 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,

 #endif /* DCT_FLOAT_SUPPORTED */

+#include "jchuff.h"
+
+static unsigned char jpeg_nbits_table[65536]; /* [v] = number of significant bits in v; filled lazily by quantize_trellis (no locking) */
+static int jpeg_nbits_table_init = 0; /* becomes 1 once jpeg_nbits_table has been filled */
+/* Uniform (all 1.0) distortion weights; quantize_trellis uses these when cinfo->use_lambda_weight_tbl is false. */
+static const float jpeg_lambda_weights_flat[64] = {
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+/* Non-uniform distortion weights used when cinfo->use_lambda_weight_tbl is set; indexed by natural-order position ("csf" presumably = contrast sensitivity function -- confirm). */
+static const float jpeg_lambda_weights_csf_luma[64] = {
+  3.35630f, 3.59892f, 3.20921f, 2.28102f, 1.42378f, 0.88079f, 0.58190f, 0.43454f,
+  3.59893f, 3.21284f, 2.71282f, 1.98092f, 1.30506f, 0.83852f, 0.56346f, 0.42146f,
+  3.20921f, 2.71282f, 2.12574f, 1.48616f, 0.99660f, 0.66132f, 0.45610f, 0.34609f,
+  2.28102f, 1.98092f, 1.48616f, 0.97492f, 0.64622f, 0.43812f, 0.31074f, 0.24072f,
+  1.42378f, 1.30506f, 0.99660f, 0.64623f, 0.42051f, 0.28446f, 0.20380f, 0.15975f,
+  0.88079f, 0.83852f, 0.66132f, 0.43812f, 0.28446f, 0.19092f, 0.13635f, 0.10701f,
+  0.58190f, 0.56346f, 0.45610f, 0.31074f, 0.20380f, 0.13635f, 0.09674f, 0.07558f,
+  0.43454f, 0.42146f, 0.34609f, 0.24072f, 0.15975f, 0.10701f, 0.07558f, 0.05875f,
+};
+
+GLOBAL(void)
+quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
+                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef)
+{
+  int i, j, k;
+  float accumulated_zero_dist[DCTSIZE2];
+  float accumulated_cost[DCTSIZE2];
+  int run_start[DCTSIZE2];
+  int bi;
+  float best_cost;
+  int last_coeff_idx; /* position of last nonzero coefficient */
+  float norm = 0.0;
+  float lambda_base;
+  float lambda;
+  const float *lambda_tbl = (cinfo->use_lambda_weight_tbl) ? jpeg_lambda_weights_csf_luma : jpeg_lambda_weights_flat;
+  int Ss, Se;
+  float *accumulated_zero_block_cost;
+  float *accumulated_block_cost;
+  int *block_run_start;
+  int *requires_eob;
+  int has_eob;
+  float cost_all_zeros;
+  float best_cost_skip;
+
+  Ss = cinfo->Ss;
+  Se = cinfo->Se;
+  if (Ss == 0)
+    Ss = 1;
+  if (Se < Ss)
+    return;
+  if (cinfo->trellis_eob_opt) {
+    accumulated_zero_block_cost = (float *)malloc((num_blocks + 1) * SIZEOF(float));
+    accumulated_block_cost = (float *)malloc((num_blocks + 1) * SIZEOF(float));
+    block_run_start = (int *)malloc(num_blocks * SIZEOF(int));
+    requires_eob = (int *)malloc((num_blocks + 1) * SIZEOF(int));
+    accumulated_zero_block_cost[0] = 0;
+    accumulated_block_cost[0] = 0;
+    requires_eob[0] = 0;
+  }
+  
+  if(!jpeg_nbits_table_init) {
+    for(i = 0; i < 65536; i++) {
+      int nbits = 0, temp = i;
+      while (temp) {temp >>= 1;  nbits++;}
+      jpeg_nbits_table[i] = nbits;
+    }
+    jpeg_nbits_table_init = 1;
+  }
+
+  norm = 0.0;
+  for (i = 1; i < DCTSIZE2; i++) {
+    norm += qtbl->quantval[i] * qtbl->quantval[i];
+  }
+  norm /= 63.0;
+  
+  lambda_base = 1.0 / norm;
+  
+  for (bi = 0; bi < num_blocks; bi++) {
+    
+    norm = 0.0;
+    for (i = 1; i < DCTSIZE2; i++) {
+      norm += src[bi][i] * src[bi][i];
+    }
+    norm /= 63.0;
+    
+    if (cinfo->lambda_log_scale2 > 0.0)
+      lambda = pow(2.0, cinfo->lambda_log_scale1) * lambda_base / (pow(2.0, cinfo->lambda_log_scale2) + norm);
+    else
+      lambda = pow(2.0, cinfo->lambda_log_scale1-12.0) * lambda_base;
+    
+    accumulated_zero_dist[Ss-1] = 0.0;
+    accumulated_cost[Ss-1] = 0.0;
+    
+    for (i = Ss; i <= Se; i++) {
+      int z = jpeg_natural_order[i];
+      
+      int sign = src[bi][z] >> 31;
+      int x = abs(src[bi][z]);
+      int q = 8 * qtbl->quantval[z];
+      int candidate[16];
+      int candidate_bits[16];
+      float candidate_dist[16];
+      int num_candidates;
+      int qval;
+      
+      accumulated_zero_dist[i] = x * x * lambda * lambda_tbl[z] + accumulated_zero_dist[i-1];
+      
+      qval = (x + q/2) / q; /* quantized value (round nearest) */
+
+      if (qval == 0) {
+        coef_blocks[bi][z] = 0;
+        accumulated_cost[i] = 1e38; /* Shouldn't be needed */
+        continue;
+      }
+
+      num_candidates = jpeg_nbits_table[qval];
+      for (k = 0; k < num_candidates; k++) {
+        int delta;
+        candidate[k] = (k < num_candidates - 1) ? (2 << k) - 1 : qval;
+        delta = candidate[k] * q - x;
+        candidate_bits[k] = k+1;
+        candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
+      }
+      
+      accumulated_cost[i] = 1e38;
+      
+      for (j = Ss-1; j < i; j++) {
+        int zz = jpeg_natural_order[j];
+        if (j != Ss-1 && coef_blocks[bi][zz] == 0)
+          continue;
+        
+        int zero_run = i - 1 - j;
+        if ((zero_run >> 4) && actbl->ehufsi[0xf0] == 0)
+          continue;
+        
+        int run_bits = (zero_run >> 4) * actbl->ehufsi[0xf0];
+        zero_run &= 15;
+
+        for (k = 0; k < num_candidates; k++) {
+          int coef_bits = actbl->ehufsi[16 * zero_run + candidate_bits[k]];
+          if (coef_bits == 0)
+            continue;
+          
+          int rate = coef_bits + candidate_bits[k] + run_bits;
+          float cost = rate + candidate_dist[k];
+          cost += accumulated_zero_dist[i-1] - accumulated_zero_dist[j] + accumulated_cost[j];
+          
+          if (cost < accumulated_cost[i]) {
+            coef_blocks[bi][z] = (candidate[k] ^ sign) - sign;
+            accumulated_cost[i] = cost;
+            run_start[i] = j;
+          }
+        }
+      }
+    }
+    
+    last_coeff_idx = Ss-1;
+    best_cost = accumulated_zero_dist[Se] + actbl->ehufsi[0];
+    cost_all_zeros = accumulated_zero_dist[Se];
+    best_cost_skip = cost_all_zeros;
+    
+    for (i = Ss; i <= Se; i++) {
+      int z = jpeg_natural_order[i];
+      if (coef_blocks[bi][z] != 0) {
+        float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
+        float cost_wo_eob = cost;
+        
+        if (i < Se)
+          cost += actbl->ehufsi[0];
+        
+        if (cost < best_cost) {
+          best_cost = cost;
+          last_coeff_idx = i;
+          best_cost_skip = cost_wo_eob;
+        }
+      }
+    }
+    
+    has_eob = (last_coeff_idx < Se) + (last_coeff_idx == Ss-1);
+    
+    /* Zero out coefficients that are part of runs */
+    i = Se;
+    while (i >= Ss)
+    {
+      while (i > last_coeff_idx) {
+        int z = jpeg_natural_order[i];
+        coef_blocks[bi][z] = 0;
+        i--;
+      }
+      last_coeff_idx = run_start[i];
+      i--;
+    }
+    
+    if (cinfo->trellis_eob_opt) {
+      accumulated_zero_block_cost[bi+1] = accumulated_zero_block_cost[bi];
+      accumulated_zero_block_cost[bi+1] += cost_all_zeros;
+      requires_eob[bi+1] = has_eob;
+      
+      best_cost = 1e38;
+      
+      if (has_eob != 2) {
+        for (i = 0; i <= bi; i++) {
+          int zero_block_run;
+          int nbits;
+          float cost;
+          
+          if (requires_eob[i] == 2)
+            continue;
+          
+          cost = best_cost_skip; /* cost of coding a nonzero block */
+          cost += accumulated_zero_block_cost[bi];
+          cost -= accumulated_zero_block_cost[i];
+          cost += accumulated_block_cost[i];
+          zero_block_run = bi - i + requires_eob[i];
+          nbits = jpeg_nbits_table[zero_block_run];
+          cost += actbl->ehufsi[16*nbits] + nbits;
+          
+          if (cost < best_cost) {
+            block_run_start[bi] = i;
+            best_cost = cost;
+            accumulated_block_cost[bi+1] = cost;
+          }
+        }
+      }
+    }
+  }
+  
+  if (cinfo->trellis_eob_opt) {
+    int last_block = num_blocks;
+    best_cost = 1e38;
+    
+    for (i = 0; i <= num_blocks; i++) {
+      int zero_block_run;
+      int nbits;
+      float cost = 0.0;
+      
+      if (requires_eob[i] == 2)
+        continue;
+
+      cost += accumulated_zero_block_cost[num_blocks];
+      cost -= accumulated_zero_block_cost[i];
+      zero_block_run = num_blocks - i + requires_eob[i];
+      nbits = jpeg_nbits_table[zero_block_run];
+      cost += actbl->ehufsi[16*nbits] + nbits;
+      if (cost < best_cost) {
+        best_cost = cost;
+        last_block = i;
+      }
+    }
+    last_block--;
+    bi = num_blocks - 1;
+    while (bi >= 0) {
+      while (bi > last_block) {
+        for (j = Ss; j <= Se; j++) {
+          int z = jpeg_natural_order[j];
+          coef_blocks[bi][z] = 0;
+        }
+        bi--;
+      }
+      last_block = block_run_start[bi]-1;
+      bi--;
+    }
+    free(accumulated_zero_block_cost);
+    free(accumulated_block_cost);
+    free(block_run_start);
+    free(requires_eob);
+  }
+  
+  if (cinfo->trellis_q_opt) {
+    for (bi = 0; bi < num_blocks; bi++) {
+      for (i = 1; i < DCTSIZE2; i++) {
+        norm_src[i] += src[bi][i] * coef_blocks[bi][i];
+        norm_coef[i] += 8 * coef_blocks[bi][i] * coef_blocks[bi][i];
+      }
+    }
+  }
+}

 /*
 * Initialize FDCT manager.
--- a/jchuff.h
+++ b/jchuff.h
@@ -2,6 +2,8 @@
 * jchuff.h
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -45,3 +47,7 @@ EXTERN(void) jpeg_make_c_derived_tbl
 /* Generate an optimal table definition given the specified counts */
 EXTERN(void) jpeg_gen_optimal_table
 	JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]));
+
+EXTERN(void) quantize_trellis
+        JPP((j_compress_ptr cinfo, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
+                 JQUANT_TBL * qtbl, double *norm_src, double *norm_coef));
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -27,7 +27,8 @@
 typedef enum {
 	main_pass,		/* input data, also do first output step */
 	huff_opt_pass,		/* Huffman code optimization pass */
-	output_pass		/* data output pass */
+	output_pass,		/* data output pass */
+        trellis_pass            /* trellis quantization pass */
 } c_pass_type;

 typedef struct {
@@ -41,6 +42,7 @@ typedef struct {
  int scan_number;		/* current index in scan_info[] */
  
  /* fields for scan optimisation */
+  int pass_number_scan_opt_base; /* pass number where scan optimization begins */
  unsigned char * scan_buffer[64]; /* buffer for a given scan */
  unsigned long scan_size[64]; /* size for a given scan */
  unsigned long best_cost; /* bit count for best frequency split */
@@ -326,9 +328,21 @@ select_scan_parameters (j_compress_ptr cinfo)
  int ci;

 #ifdef C_MULTISCAN_FILES_SUPPORTED
-  if (cinfo->scan_info != NULL) {
+  my_master_ptr master = (my_master_ptr) cinfo->master;
+  if (master->pass_number < master->pass_number_scan_opt_base) {
+    cinfo->comps_in_scan = 1;
+    if (cinfo->use_scans_in_trellis) {
+      cinfo->cur_comp_info[0] = &cinfo->comp_info[master->pass_number/(4*cinfo->trellis_num_loops)];
+      cinfo->Ss = (master->pass_number%4 < 2) ? 1 : cinfo->trellis_freq_split+1;
+      cinfo->Se = (master->pass_number%4 < 2) ? cinfo->trellis_freq_split : DCTSIZE2-1;
+    } else {
+      cinfo->cur_comp_info[0] = &cinfo->comp_info[master->pass_number/(2*cinfo->trellis_num_loops)];
+      cinfo->Ss = 1;
+      cinfo->Se = DCTSIZE2-1;
+    }
+  }
+  else if (cinfo->scan_info != NULL) {
    /* Prepare for current scan --- the script is already validated */
-    my_master_ptr master = (my_master_ptr) cinfo->master;
    const jpeg_scan_info * scanptr = cinfo->scan_info + master->scan_number;

    cinfo->comps_in_scan = scanptr->comps_in_scan;
@@ -467,6 +481,7 @@ METHODDEF(void)
 prepare_for_pass (j_compress_ptr cinfo)
 {
  my_master_ptr master = (my_master_ptr) cinfo->master;
+  cinfo->trellis_passes = master->pass_number < master->pass_number_scan_opt_base;

  switch (master->pass_type) {
  case main_pass:
@@ -534,6 +549,22 @@ prepare_for_pass (j_compress_ptr cinfo)
    (*cinfo->marker->write_scan_header) (cinfo);
    master->pub.call_pass_startup = FALSE;
    break;
+  case trellis_pass:
+    if (master->pass_number%(cinfo->num_components*(cinfo->use_scans_in_trellis?4:2)) == 1 && cinfo->trellis_q_opt) {
+      int i, j;
+
+      for (i = 0; i < NUM_QUANT_TBLS; i++) {
+        for (j = 1; j < DCTSIZE2; j++) {
+          cinfo->norm_src[i][j] = 0.0;
+          cinfo->norm_coef[i][j] = 0.0;
+        }
+      }
+    }
+    (*cinfo->entropy->start_pass) (cinfo, TRUE);
+    (*cinfo->coef->start_pass) (cinfo, JBUF_REQUANT);
+    master->pub.call_pass_startup = FALSE;
+    break;
+      
  default:
    ERREXIT(cinfo, JERR_NOT_COMPILED);
  }
@@ -575,6 +606,16 @@ copy_buffer (j_compress_ptr cinfo, int scan_idx)
  
  unsigned long size = master->scan_size[scan_idx];
  unsigned char * src = master->scan_buffer[scan_idx];
+  int i;
+  
+  if (cinfo->err->trace_level > 0) {
+    fprintf(stderr, "SCAN ");
+    for (i = 0; i < cinfo->scan_info[scan_idx].comps_in_scan; i++)
+      fprintf(stderr, "%s%d", (i==0)?"":",", cinfo->scan_info[scan_idx].component_index[i]);
+    fprintf(stderr, ": %d %d", cinfo->scan_info[scan_idx].Ss, cinfo->scan_info[scan_idx].Se);
+    fprintf(stderr, " %d %d", cinfo->scan_info[scan_idx].Ah, cinfo->scan_info[scan_idx].Al);
+    fprintf(stderr, "\n");
+  }
  
  while (size >= cinfo->dest->free_in_buffer)
  {
@@ -615,7 +656,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
        master->best_Al_luma = Al;
      } else {
        master->scan_number = luma_freq_split_scan_start - 1;
-        master->pass_number = 2 * master->scan_number + 1;
+        master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
      }
    }
  
@@ -640,7 +681,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
          (idx == 3 && master->best_freq_split_idx_luma != 2) ||
          (idx == 4 && master->best_freq_split_idx_luma != 4)) {
        master->scan_number = cinfo->num_scans_luma - 1;
-        master->pass_number = 2 * master->scan_number + 1;
+        master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
        master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
      }
    }
@@ -672,7 +713,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
          master->best_Al_chroma = Al;
        } else {
          master->scan_number = chroma_freq_split_scan_start - 1;
-          master->pass_number = 2 * master->scan_number + 1;
+          master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
        }
      }

@@ -700,7 +741,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
            (idx == 3 && master->best_freq_split_idx_chroma != 2) ||
            (idx == 4 && master->best_freq_split_idx_chroma != 4)) {
          master->scan_number = cinfo->num_scans - 1;
-          master->pass_number = 2 * master->scan_number + 1;
+          master->pass_number = 2 * master->scan_number + 1 + master->pass_number_scan_opt_base;
          master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
        }
      }
@@ -713,7 +754,7 @@ select_scans (j_compress_ptr cinfo, int next_scan_number)
    
    copy_buffer(cinfo, 0);

-    if (cinfo->num_scans > cinfo->num_scans_luma) {
+    if (cinfo->num_scans > cinfo->num_scans_luma && !cinfo->one_dc_scan) {
      base_scan_idx = cinfo->num_scans_luma;
      
      if (master->interleave_chroma_dc)
@@ -791,13 +832,17 @@ finish_pass_master (j_compress_ptr cinfo)
    /* next pass is either output of scan 0 (after optimization)
     * or output of scan 1 (if no optimization).
     */
-    master->pass_type = output_pass;
-    if (! cinfo->optimize_coding)
-      master->scan_number++;
+    if (cinfo->trellis_quant)
+      master->pass_type = trellis_pass;
+    else {
+      master->pass_type = output_pass;
+      if (! cinfo->optimize_coding)
+        master->scan_number++;
+    }
    break;
  case huff_opt_pass:
    /* next pass is always output of current scan */
-    master->pass_type = output_pass;
+    master->pass_type = (master->pass_number < master->pass_number_scan_opt_base-1) ? trellis_pass : output_pass;
    break;
  case output_pass:
    /* next pass is either optimization or output of next scan */
@@ -811,6 +856,24 @@ finish_pass_master (j_compress_ptr cinfo)

    master->scan_number++;
    break;
+  case trellis_pass:
+    master->pass_type = (cinfo->optimize_coding || master->pass_number < master->pass_number_scan_opt_base-1) ? huff_opt_pass : output_pass;
+      
+    if ((master->pass_number+1)%(cinfo->num_components*(cinfo->use_scans_in_trellis?4:2)) == 0 && cinfo->trellis_q_opt) {
+      int i, j;
+
+      for (i = 0; i < NUM_QUANT_TBLS; i++) {
+        for (j = 1; j < DCTSIZE2; j++) {
+          if (cinfo->norm_coef[i][j] != 0.0) {
+            int q = (int)(cinfo->norm_src[i][j] / cinfo->norm_coef[i][j] + 0.5);
+            if (q > 254) q = 254;
+            if (q < 1) q = 1;
+            cinfo->quant_tbl_ptrs[i]->quantval[j] = q;
+          }
+        }
+      }
+    }
+    break;
  }

  master->pass_number++;
@@ -870,6 +933,13 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
  else
    master->total_passes = cinfo->num_scans;
  
+  if (cinfo->trellis_quant) { /* add the extra passes used when trellis quantization is on */
+    if (cinfo->progressive_mode) /* progressive: (4 or 2) * num_components * trellis_num_loops extra passes */
+      master->total_passes += ((cinfo->use_scans_in_trellis) ? 4 : 2) * cinfo->num_components * cinfo->trellis_num_loops;
+    else /* non-progressive: a single extra pass */
+      master->total_passes += 1;
+  }
+  
  if (cinfo->optimize_scans) {
    int i;
    master->best_Al_chroma = 0;
@@ -877,4 +947,9 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
    for (i = 0; i < cinfo->num_scans; i++)
      master->scan_buffer[i] = NULL;
  }
+  
+  if (cinfo->trellis_quant) /* threshold that finish_pass_master compares pass_number against. NOTE(review): unlike the total_passes adjustment in this function, progressive_mode is not consulted here -- confirm that is intended */
+    master->pass_number_scan_opt_base = ((cinfo->use_scans_in_trellis) ? 4 : 2) * cinfo->num_components * cinfo->trellis_num_loops;
+  else /* no trellis passes */
+    master->pass_number_scan_opt_base = 0;
 }
--- a/jcparam.c
+++ b/jcparam.c
@@ -90,6 +90,16 @@ static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
  99,  99,  99,  99,  99,  99,  99,  99
 };

+static const unsigned int flat_quant_tbl[DCTSIZE2] = { /* uniform table, every entry 16; jpeg_set_linear_quality installs it in slots 0 and 1 when cinfo->use_flat_quant_tbl is set */
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16,
+  16,  16,  16,  16,  16,  16,  16,  16
+};

 #if JPEG_LIB_VERSION >= 70
 GLOBAL(void)
@@ -101,9 +111,9 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
 {
  /* Set up two quantization tables using the specified scaling */
  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-		       cinfo->q_scale_factor[0], force_baseline);
+                       cinfo->q_scale_factor[0], force_baseline);
  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-		       cinfo->q_scale_factor[1], force_baseline);
+                       cinfo->q_scale_factor[1], force_baseline);
 }
 #endif

@@ -118,10 +128,17 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 */
 {
  /* Set up two quantization tables using the specified scaling */
-  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-		       scale_factor, force_baseline);
-  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-		       scale_factor, force_baseline);
+  if (cinfo->use_flat_quant_tbl) { /* flat table for both slot 0 and slot 1. NOTE(review): jpeg_default_qtables still installs the standard tables unconditionally -- confirm that is intended */
+    jpeg_add_quant_table(cinfo, 0, flat_quant_tbl,
+                         scale_factor, force_baseline);
+    jpeg_add_quant_table(cinfo, 1, flat_quant_tbl,
+                         scale_factor, force_baseline);
+  } else { /* standard luminance table in slot 0, standard chrominance table in slot 1 */
+    jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+                         scale_factor, force_baseline);
+    jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+                         scale_factor, force_baseline);
+  }
 }


@@ -325,6 +342,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
  }

 #ifdef C_PROGRESSIVE_SUPPORTED
+  cinfo->scan_info = NULL;
+  cinfo->num_scans = 0;
  if (!cinfo->use_moz_defaults) {
    /* Default is no multiple-scan output */
    cinfo->scan_info = NULL;
@@ -399,6 +418,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)

  jpeg_default_colorspace(cinfo);
  
+  cinfo->one_dc_scan = TRUE;
+  
 #ifdef C_PROGRESSIVE_SUPPORTED
  if (cinfo->use_moz_defaults) {
    cinfo->optimize_scans = TRUE;
@@ -406,6 +427,16 @@ jpeg_set_defaults (j_compress_ptr cinfo)
  } else
    cinfo->optimize_scans = FALSE;
 #endif
+  
+  cinfo->trellis_quant = cinfo->use_moz_defaults; /* trellis quantization is on exactly when the mozjpeg defaults are in use */
+  cinfo->lambda_log_scale1 = 16.0; /* default; cjpeg's -lambda1 switch overwrites it */
+  cinfo->lambda_log_scale2 = 15.5;
+  /* NOTE(review): trellis_eob_opt and trellis_passes (declared in jpeglib.h) get no default in this block -- confirm they are initialized elsewhere */
+  cinfo->use_lambda_weight_tbl = TRUE;
+  cinfo->use_scans_in_trellis = FALSE; /* FALSE selects the factor 2 (not 4) in the trellis pass-count formulas */
+  cinfo->trellis_freq_split = 8;
+  cinfo->trellis_num_loops = 1; /* multiplier in the trellis pass-count formulas */
+  cinfo->trellis_q_opt = FALSE; /* quant tables are not rewritten during trellis passes by default */
 }


@@ -673,7 +704,10 @@ jpeg_search_progression (j_compress_ptr cinfo)
  /* last 4 done conditionally */
  
  /* luma DC by itself */
-  scanptr = fill_dc_scans(scanptr, 1, 0, 0);
+  if (cinfo->one_dc_scan) /* single-DC-scan mode: pass all ncomps components to fill_dc_scans */
+    scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
+  else /* otherwise luma DC by itself (component count 1) */
+    scanptr = fill_dc_scans(scanptr, 1, 0, 0);
  
  scanptr = fill_a_scan(scanptr, 0, 1, 8, 0, 0);
  scanptr = fill_a_scan(scanptr, 0, 9, 63, 0, 0);
@@ -761,7 +795,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
    nscans = 10;
  } else {
    /* All-purpose script for other color spaces. */
-    if (cinfo->use_moz_defaults) {
+    if (cinfo->use_moz_defaults == TRUE) {
      if (ncomps > MAX_COMPS_IN_SCAN)
        nscans = 5 * ncomps;	/* 2 DC + 4 AC scans per component */
      else
@@ -793,11 +827,15 @@ jpeg_simple_progression (j_compress_ptr cinfo)

  if (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr) {
    /* Custom script for YCbCr color images. */
-    if (cinfo->use_moz_defaults) {
+    if (cinfo->use_moz_defaults == TRUE) {
      /* scan defined in jpeg_scan_rgb.txt in jpgcrush */
      /* Initial DC scan */
-      scanptr = fill_dc_scans(scanptr, 1, 0, 0);
-      scanptr = fill_a_scan_pair(scanptr, 1, 0, 0, 0, 0);
+      if (cinfo->one_dc_scan) /* single-DC-scan mode: one fill_dc_scans call for all ncomps components */
+        scanptr = fill_dc_scans(scanptr, ncomps, 0, 0); /* NOTE(review): this branch makes one fill call where the other makes two -- confirm the nscans value computed earlier matches the number of scans actually emitted */
+      else { /* separate path: fill_dc_scans for one component, then fill_a_scan_pair */
+        scanptr = fill_dc_scans(scanptr, 1, 0, 0);
+        scanptr = fill_a_scan_pair(scanptr, 1, 0, 0, 0, 0);
+      }
      /* Low frequency AC scans */
      scanptr = fill_a_scan(scanptr, 0, 1, 8, 0, 2);
      scanptr = fill_a_scan(scanptr, 1, 1, 8, 0, 0);
@@ -832,7 +870,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
    }
  } else {
    /* All-purpose script for other color spaces. */
-    if (cinfo->use_moz_defaults) {
+    if (cinfo->use_moz_defaults == TRUE) {
      /* scan defined in jpeg_scan_bw.txt in jpgcrush */
      /* DC component, no successive approximation */
      scanptr = fill_dc_scans(scanptr, ncomps, 0, 0);
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -169,6 +169,14 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				      257 * SIZEOF(long));
      MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long));
+      if (cinfo->trellis_passes) {
+        /* When generating tables for trellis passes, make sure that all */
+        /* codewords have an assigned length */
+        int i, j;
+        for (i = 0; i < 16; i++)
+          for (j = 0; j < 12; j++)
+            entropy->count_ptrs[tbl][16*i+j] = 1; /* symbols 16*i+j (i < 16, j < 12) start with count 1 instead of the 0 left by MEMZERO */
+      }
    } else {
      /* Compute derived values for Huffman table */
      /* We may do this more than once for a table, but it's not expensive */
--- a/jpegint.h
+++ b/jpegint.h
@@ -3,6 +3,8 @@
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
 * Modified 1997-2009 by Guido Vollbeding.
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -19,7 +21,9 @@ typedef enum {			/* Operating modes for buffer controllers */
 	/* Remaining modes require a full-image buffer to have been created */
 	JBUF_SAVE_SOURCE,	/* Run source subobject only, save output */
 	JBUF_CRANK_DEST,	/* Run dest subobject only, using saved data */
-	JBUF_SAVE_AND_PASS	/* Run both subobjects, save output */
+	JBUF_SAVE_AND_PASS,	/* Run both subobjects, save output */
+        JBUF_REQUANT            /* Requantize */
+
 } J_BUF_MODE;

 /* Values of global_state field (jdapi.c has some dependencies on ordering!) */
@@ -107,7 +111,7 @@ struct jpeg_forward_dct {
 			      jpeg_component_info * compptr,
 			      JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 			      JDIMENSION start_row, JDIMENSION start_col,
-			      JDIMENSION num_blocks));
+			      JDIMENSION num_blocks, JBLOCKROW dst));
 };

 /* Entropy encoding */
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -376,8 +376,22 @@ struct jpeg_compress_struct {
  int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
  J_DCT_METHOD dct_method;	/* DCT algorithm selector */

-  boolean use_moz_defaults; /* TRUE if using Mozilla defaults */
+  boolean use_moz_defaults; /* TRUE=use Mozilla defaults */
  boolean optimize_scans; /* TRUE=optimize progressive coding scans */
+  boolean one_dc_scan; /* TRUE=use a single DC scan interleaving all components */
+  boolean trellis_quant; /* TRUE=use trellis quantization */
+  boolean trellis_eob_opt; /* TRUE=optimize for sequences of EOB */
+  boolean use_flat_quant_tbl; /* TRUE=use flat quantization table */
+  boolean use_lambda_weight_tbl; /* TRUE=use lambda weighting table */
+  boolean use_scans_in_trellis; /* TRUE=use scans in trellis optimization */
+  boolean trellis_passes; /* TRUE=currently doing trellis-related passes */
+  boolean trellis_q_opt; /* TRUE=optimize quant table in trellis loop */
+  
+  double norm_src[NUM_QUANT_TBLS][DCTSIZE2]; /* numerator: with trellis_q_opt set, finish_pass_master sets quantval[j] to norm_src/norm_coef (rounded, clamped to 1..254) */
+  double norm_coef[NUM_QUANT_TBLS][DCTSIZE2]; /* denominator of that ratio; entries equal to 0.0 leave quantval[j] untouched */
+
+  int trellis_freq_split; /* splitting point for frequency in trellis quantization */
+  int trellis_num_loops; /* number of trellis loops */
  
  int num_scans_luma; /* # of entries in scan_info array pertaining to luma (used when optimize_scans is TRUE */
  int num_scans_luma_dc;
@@ -387,6 +401,9 @@ struct jpeg_compress_struct {
  int Al_max_luma; /* maximum value of Al tested when optimizing scans (luma) */
  int Al_max_chroma; /* maximum value of Al tested when optimizing scans (chroma) */

+  float lambda_log_scale1; /* jpeg_set_defaults sets 16.0; cjpeg's -lambda1 switch overrides it */
+  float lambda_log_scale2; /* jpeg_set_defaults sets 15.5 */
+  
  /* The restart interval can be specified in absolute MCUs by setting
   * restart_interval, or in MCU rows by setting restart_in_rows
   * (in which case the correct restart_interval will be figured
--- a/jversion.h
+++ b/jversion.h
@@ -5,6 +5,8 @@
 * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
 * Modifications:
 * Copyright (C) 2010, 2012-2013, D. R. Commander.
+ * mozjpeg Modifications:
+ * Copyright (C) 2014, Mozilla Corporation.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains software version identification.
@@ -29,4 +31,6 @@
 			"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
 			"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
 			"Copyright (C) 2009-2013 D. R. Commander\n" \
-			"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)"
+			"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+                        "Copyright (C) 2014 Mozilla Corporation\n"
+