Re-factor and re-license under the libjpeg BSD-style license. Justification: the accelerated Huffman encoding optimizations in libjpeg-turbo were all developed by me as an independent developer. The structure of the inline Huffman encoding macros was originally borrowed from similar routines in the TurboJPEG/mediaLib codec, which is part of VirtualGL and TurboVNC. Thus, although the code for these macros was not copied verbatim, they were still thought to be a derivative work of TurboJPEG/mediaLib, and I assigned the copyright and license from TurboJPEG/mediaLib to them. I have re-written these routines from first principles by breaking down the libjpeg out-of-line routines. Although the new code bears algorithmic similarities to the TurboJPEG/mediaLib macros, it can now clearly be shown to be derived from the out-of-line routines and thus, in my opinion, it can no longer be considered a derivative of TurboJPEG/mediaLib. -- DRC

2011-04-26 22:08:31 +00:00
parent 2540beb951
commit ed7ec8322c
1 changed file with 167 additions and 185 deletions
--- a/jchuff.c
+++ b/jchuff.c
@@ -2,6 +2,7 @@
 * jchuff.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
 * Copyright (C) 2009-2011, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -14,21 +15,6 @@
 * permanent JPEG objects only upon successful completion of an MCU.
 */
 /* Modifications:
 * Copyright (C)2007 Sun Microsystems, Inc.
 * Copyright (C)2009 D. R. Commander
 *
 * This library is free software and may be redistributed and/or modified under
 * the terms of the wxWindows Library License, Version 3.1 or (at your option)
 * any later version.  The full license is in the LICENSE.txt file included
 * with this distribution.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * wxWindows Library License for more details.
 */
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
@@ -36,12 +22,13 @@
 #include <limits.h>
 static unsigned char jpeg_first_bit_table[65536];
-static int jpeg_first_bit_table_init=0;
+static int jpeg_first_bit_table_init = 0;
 #ifndef min
 #define min(a,b) ((a)<(b)?(a):(b))
 #endif
 /* Expanded entropy encoder object for Huffman encoding.
 *
 * The savable_state subrecord contains fields that change within an MCU,
@@ -181,7 +168,6 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
  }
  /* Initialize bit buffer to empty */
  entropy->saved.put_buffer = 0;
  entropy->saved.put_bits = 0;
@@ -325,30 +311,27 @@ dump_buffer (working_state * state)
 /* Outputting bits to the file */
-/* Only the right 24 bits of put_buffer are used; the valid bits are
+/* These macros perform the same task as the emit_bits() function in the
- * left-justified in this part.  At most 16 bits can be passed to emit_bits
+ * original libjpeg code.  In addition to reducing overhead by explicitly
- * in one call, and we never retain more than 7 bits in put_buffer
+ * inlining the code, additional performance is achieved by taking into
- * between calls, so 24 bits are sufficient.
+ * account the size of the bit buffer and waiting until it is almost full
 * before emptying it.  This mostly benefits 64-bit platforms, since 6
 * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
 */
 /***************************************************************/
 #define EMIT_BYTE() { \
-  if (0xFF == (*buffer++ =  (unsigned char)(put_buffer >> (put_bits -= 8))))  \
+  JOCTET c; \
  put_bits -= 8; \
  c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
  *buffer++ = c; \
  if (c == 0xFF)  /* need to stuff a zero byte? */ \
    *buffer++ = 0; \
 }
-/***************************************************************/
+#define PUT_BITS(code, size) { \
 #define DUMP_BITS_(code, size) {                                \
  put_bits += size; \
  put_buffer = (put_buffer << size) | code; \
-  if (put_bits > 7)                                             \
+}
    while(put_bits > 7)                                         \
      EMIT_BYTE()                                               \
 }
 /***************************************************************/
 #define CHECKBUF15() { \
  if (put_bits > 15) { \
@@ -357,6 +340,15 @@ dump_buffer (working_state * state)
  } \
 }
 #define CHECKBUF31() { \
  if (put_bits > 31) { \
    EMIT_BYTE() \
    EMIT_BYTE() \
    EMIT_BYTE() \
    EMIT_BYTE() \
  } \
 }
 #define CHECKBUF47() { \
  if (put_bits > 47) { \
    EMIT_BYTE() \
@@ -368,87 +360,37 @@ dump_buffer (working_state * state)
  } \
 }
-#define CHECKBUF31() {                                          \
+#if __WORDSIZE==64 || defined(_WIN64)
-  if (put_bits > 31) {                                          \
+
-    EMIT_BYTE()                                                 \
+#define EMIT_BITS(code, size) { \
-    EMIT_BYTE()                                                 \
+  CHECKBUF47() \
-    EMIT_BYTE()                                                 \
+  PUT_BITS(code, size) \
    EMIT_BYTE()                                                 \
  }                                                             \
 }
-/***************************************************************/
+#define EMIT_CODE(code, size) { \
-
+  temp2 &= (((INT32) 1)<<nbits) - 1; \
 #define DUMP_BITS_NOCHECK(code, size) {                         \
  put_bits += size;                                             \
  put_buffer = (put_buffer << size) | code;                     \
 }
 #if __WORDSIZE==64 || defined(_WIN64)
 #define DUMP_BITS(code, size) {                                 \
  CHECKBUF47()                                                  \
  put_bits += size;                                             \
  put_buffer = (put_buffer << size) | code;                     \
 }
 #else
 #define DUMP_BITS(code, size) {                                 \
  put_bits += size;                                             \
  put_buffer = (put_buffer << size) | code;                     \
  CHECKBUF15()                                                  \
 }
 #endif
 /***************************************************************/
 #define DUMP_SINGLE_VALUE(ht, codevalue) { \
  size = ht->ehufsi[codevalue];            \
  code = ht->ehufco[codevalue];            \
                                           \
  DUMP_BITS(code, size)                    \
 }
 /***************************************************************/
 #define DUMP_VALUE_SLOW(ht, codevalue, t, nbits) { \
  size = ht->ehufsi[codevalue];               \
  code = ht->ehufco[codevalue];               \
  t &= ~(-1 << nbits);                        \
  DUMP_BITS_NOCHECK(code, size)               \
  CHECKBUF15()                                \
  DUMP_BITS_NOCHECK(t, nbits)                 \
  CHECKBUF15()                                \
 }
 #if __WORDSIZE==64 || defined(_WIN64)
 #define DUMP_VALUE(ht, codevalue, t, nbits) { \
  size = ht->ehufsi[codevalue];               \
  code = ht->ehufco[codevalue];               \
  t &= ~(-1 << nbits);                        \
  CHECKBUF31() \
-  DUMP_BITS_NOCHECK(code, size)               \
+  PUT_BITS(code, size) \
-  DUMP_BITS_NOCHECK(t, nbits)                 \
+  PUT_BITS(temp2, nbits) \
 }
 #else
-#define DUMP_VALUE(ht, codevalue, t, nbits) { \
+#define EMIT_BITS(code, size) { \
-  size = ht->ehufsi[codevalue];               \
+  PUT_BITS(code, size) \
  code = ht->ehufco[codevalue];               \
  t &= ~(-1 << nbits);                        \
  DUMP_BITS_NOCHECK(code, size)               \
  CHECKBUF15() \
-  DUMP_BITS_NOCHECK(t, nbits)                 \
+}
 #define EMIT_CODE(code, size) { \
  temp2 &= (((INT32) 1)<<nbits) - 1; \
  PUT_BITS(code, size) \
  CHECKBUF15() \
  PUT_BITS(temp2, nbits) \
  CHECKBUF15() \
 }
 #endif
 /***************************************************************/
 #define BUFSIZE (DCTSIZE2 * 2)
@@ -481,12 +423,11 @@ dump_buffer (working_state * state)
  } \
 }
 /***************************************************************/
 LOCAL(boolean)
 flush_bits (working_state * state)
 {
-  unsigned char _buffer[BUFSIZE], *buffer;
+  JOCTET _buffer[BUFSIZE], *buffer;
  size_t put_buffer;  int put_bits;
  size_t bytes, bytestocopy;  int localbuf = 0;
@@ -494,7 +435,9 @@ flush_bits (working_state * state)
  put_bits = state->cur.put_bits;
  LOAD_BUFFER()
-  DUMP_BITS_(0x7F, 7)
+  /* fill any partial byte with ones */
  PUT_BITS(0x7F, 7)
  while (put_bits >= 8) EMIT_BYTE()
  state->cur.put_buffer = 0;	/* and reset bit-buffer to empty */
  state->cur.put_bits = 0;
@@ -503,16 +446,17 @@ flush_bits (working_state * state)
  return TRUE;
 }
 /* Encode a single block's worth of coefficients */
 LOCAL(boolean)
 encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
 		  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  int temp, temp2;
+  int temp, temp2, temp3;
  int nbits;
-  int r, sflag, size, code;
+  int r, code, size;
-  unsigned char _buffer[BUFSIZE], *buffer;
+  JOCTET _buffer[BUFSIZE], *buffer;
  size_t put_buffer;  int put_bits;
  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
  size_t bytes, bytestocopy;  int localbuf = 0;
@@ -525,50 +469,88 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
  temp = temp2 = block[0] - last_dc_val;
-  sflag = temp >> 31;
+ /* This is a well-known technique for obtaining the absolute value without a
-  temp -= ((temp + temp) & sflag);
+  * branch.  It is derived from an assembly language technique presented in
-  temp2 += sflag;
+  * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
  * Agner Fog.
  */
  temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
  temp ^= temp3;
  temp -= temp3;
  /* For a negative input, want temp2 = bitwise complement of abs(input) */
  /* This code assumes we are on a two's complement machine */
  temp2 += temp3;
  /* Find the number of bits needed for the magnitude of the coefficient */
  nbits = jpeg_first_bit_table[temp];
-  DUMP_VALUE_SLOW(dctbl, nbits, temp2, nbits)
+
  /* Emit the Huffman-coded symbol for the number of bits */
  code = dctbl->ehufco[nbits];
  size = dctbl->ehufsi[nbits];
  PUT_BITS(code, size)
  CHECKBUF15()
  /* Mask off any extra bits in code */
  temp2 &= (((INT32) 1)<<nbits) - 1;
  /* Emit that number of bits of the value, if positive, */
  /* or the complement of its magnitude, if negative. */
  PUT_BITS(temp2, nbits)
  CHECKBUF15()
  /* Encode the AC coefficients per section F.1.2.2 */
  r = 0;			/* r = run length of zeros */
-#define innerloop(order) {  \
+/* Manually unroll the k loop to eliminate the counter variable.  This
-  temp2  = *(JCOEF*)((unsigned char*)block + order);  \
+ * improves performance greatly on systems with a limited number of
-  if(temp2 == 0) r++;  \
+ * registers (such as x86.)
-  else {  \
+ */
-    temp = (JCOEF)temp2;  \
+#define kloop(jpeg_natural_order_of_k) {  \
-    sflag = temp >> 31;  \
+  if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
-    temp = (temp ^ sflag) - sflag;  \
+    r++; \
-    temp2 += sflag;  \
+  } else { \
    temp2 = temp; \
    /* Branch-less absolute value, bitwise complement, etc., same as above */ \
    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
    temp ^= temp3; \
    temp -= temp3; \
    temp2 += temp3; \
    nbits = jpeg_first_bit_table[temp]; \
-    for(; r > 15; r -= 16) DUMP_BITS(code_0xf0, size_0xf0)  \
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
-    sflag = (r << 4) + nbits;  \
+    while (r > 15) { \
-    DUMP_VALUE(actbl, sflag, temp2, nbits)  \
+      EMIT_BITS(code_0xf0, size_0xf0) \
      r -= 16; \
    } \
    /* Emit Huffman symbol for run length / number of bits */ \
    temp3 = (r << 4) + nbits;  \
    code = actbl->ehufco[temp3]; \
    size = actbl->ehufsi[temp3]; \
    EMIT_CODE(code, size) \
    r = 0;  \
-  }}
+  } \
 }
-  innerloop(2*1);   innerloop(2*8);   innerloop(2*16);  innerloop(2*9);
+  /* One iteration for each value in jpeg_natural_order[] */
-  innerloop(2*2);   innerloop(2*3);   innerloop(2*10);  innerloop(2*17);
+  kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
-  innerloop(2*24);  innerloop(2*32);  innerloop(2*25);  innerloop(2*18);
+  kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
-  innerloop(2*11);  innerloop(2*4);   innerloop(2*5);   innerloop(2*12);
+  kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
-  innerloop(2*19);  innerloop(2*26);  innerloop(2*33);  innerloop(2*40);
+  kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
-  innerloop(2*48);  innerloop(2*41);  innerloop(2*34);  innerloop(2*27);
+  kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
-  innerloop(2*20);  innerloop(2*13);  innerloop(2*6);   innerloop(2*7);
+  kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
-  innerloop(2*14);  innerloop(2*21);  innerloop(2*28);  innerloop(2*35);
+  kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
-  innerloop(2*42);  innerloop(2*49);  innerloop(2*56);  innerloop(2*57);
+  kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
-  innerloop(2*50);  innerloop(2*43);  innerloop(2*36);  innerloop(2*29);
+  kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
-  innerloop(2*22);  innerloop(2*15);  innerloop(2*23);  innerloop(2*30);
+  kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
-  innerloop(2*37);  innerloop(2*44);  innerloop(2*51);  innerloop(2*58);
+  kloop(55);  kloop(62);  kloop(63);
  innerloop(2*59);  innerloop(2*52);  innerloop(2*45);  innerloop(2*38);
  innerloop(2*31);  innerloop(2*39);  innerloop(2*46);  innerloop(2*53);
  innerloop(2*60);  innerloop(2*61);  innerloop(2*54);  innerloop(2*47);
  innerloop(2*55);  innerloop(2*62);  innerloop(2*63);
  /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0) DUMP_SINGLE_VALUE(actbl, 0x0)
+  if (r > 0) {
    code = actbl->ehufco[0];
    size = actbl->ehufsi[0];
    EMIT_BITS(code, size)
  }
  state->cur.put_buffer = put_buffer;
  state->cur.put_bits = put_bits;