Prog Huff enc: bit count/branchless abs val opts

Ported from baseline Huffman encoder. This improves overall compression performance by ~3-9% in my testing.
2018-02-14 17:22:00 -06:00
parent 985ef4f9f1
commit 37bae1a0e9
1 changed files with 59 additions and 33 deletions
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -4,7 +4,7 @@
 * This file was part of the Independent JPEG Group's software:
 * Copyright (C) 1995-1997, Thomas G. Lane.
 * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, D. R. Commander.
 * For conditions of distribution and use, see the accompanying README.ijg
 * file.
 *
@@ -19,9 +19,41 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jchuff.h"             /* Declarations shared with jchuff.c */
+#include <limits.h>
 #ifdef C_PROGRESSIVE_SUPPORTED
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table.  This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.)  This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on ARM processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it.  When building for ARMv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#if !defined __thumb__ || defined __thumb2__
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+#ifdef USE_CLZ_INTRINSIC
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
 /* Expanded entropy encoder object for progressive Huffman encoding. */
 typedef struct {
@@ -320,9 +352,7 @@ emit_eobrun (phuff_entropy_ptr entropy)
  if (entropy->EOBRUN > 0) {    /* if there is any pending EOBRUN */
    temp = entropy->EOBRUN;
-    nbits = 0;
+    nbits = JPEG_NBITS_NONZERO(temp) - 1;
-    while ((temp >>= 1))
-      nbits++;
    /* safety check: shouldn't happen given limited correction-bit buffer */
    if (nbits > 14)
      ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
@@ -378,7 +408,7 @@ METHODDEF(boolean)
 encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
+  register int temp, temp2, temp3;
  register int nbits;
  int blkn, ci;
  int Al = cinfo->Al;
@@ -410,20 +440,20 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
    entropy->last_dc_val[ci] = temp2;
    /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp2 = temp;
+
-    if (temp < 0) {
+    /* This is a well-known technique for obtaining the absolute value without
-      temp = -temp;             /* temp is abs value of input */
+     * a branch.  It is derived from an assembly language technique presented
-      /* For a negative input, want temp2 = bitwise complement of abs(input) */
+     * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
-      /* This code assumes we are on a two's complement machine */
+     * 1997 by Agner Fog.
-      temp2--;
+     */
-    }
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    temp ^= temp3;
+    temp -= temp3;              /* temp is abs value of input */
+    /* For a negative input, want temp2 = bitwise complement of abs(input) */
+    temp2 = temp ^ temp3;
    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 0;
+    nbits = JPEG_NBITS(temp);
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
    /* Check for out-of-range coefficient values.
     * Since we're encoding a difference, the range limit is twice as much.
     */
@@ -465,7 +495,7 @@ METHODDEF(boolean)
 encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
+  register int temp, temp2, temp3;
  register int nbits;
  register int r, k;
  int Se = cinfo->Se;
@@ -497,15 +527,12 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     * in C, we shift after obtaining the absolute value; so the code is
     * interwoven with finding the abs value (temp) and output bits (temp2).
     */
-    if (temp < 0) {
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-      temp = -temp;             /* temp is abs value of input */
+    temp ^= temp3;
-      temp >>= Al;              /* apply the point transform */
+    temp -= temp3;              /* temp is abs value of input */
-      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
+    temp >>= Al;                /* apply the point transform */
-      temp2 = ~temp;
+    /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
-    } else {
+    temp2 = temp ^ temp3;
-      temp >>= Al;              /* apply the point transform */
-      temp2 = temp;
-    }
    /* Watch out for case that nonzero coef is zero after point transform */
    if (temp == 0) {
      r++;
@@ -522,9 +549,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
    }
    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;                  /* there must be at least one 1 bit */
+    nbits = JPEG_NBITS_NONZERO(temp);  /* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
    /* Check for out-of-range coefficient values */
    if (nbits > MAX_COEF_BITS)
      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
@@ -619,7 +644,7 @@ METHODDEF(boolean)
 encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
+  register int temp, temp3;
  register int r, k;
  int EOB;
  char *BR_buffer;
@@ -650,8 +675,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     * is an integer division with rounding towards 0.  To do this portably
     * in C, we shift after obtaining the absolute value.
     */
-    if (temp < 0)
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-      temp = -temp;             /* temp is abs value of input */
+    temp ^= temp3;
+    temp -= temp3;              /* temp is abs value of input */
    temp >>= Al;                /* apply the point transform */
    absvalues[k] = temp;        /* save abs value for main pass */
    if (temp == 1)