Progressive Huffman encoder: bit-counting and branchless absolute-value optimizations

Ported from baseline Huffman encoder.  This improves overall compression
performance by ~3-9% in my testing.
This commit is contained in:
DRC
2018-02-14 17:22:00 -06:00
parent 985ef4f9f1
commit 37bae1a0e9

View File

@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software: * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane. * Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications: * libjpeg-turbo Modifications:
* Copyright (C) 2015, D. R. Commander. * Copyright (C) 2011, 2015, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg * For conditions of distribution and use, see the accompanying README.ijg
* file. * file.
* *
@@ -19,9 +19,41 @@
#include "jinclude.h" #include "jinclude.h"
#include "jpeglib.h" #include "jpeglib.h"
#include "jchuff.h" /* Declarations shared with jchuff.c */ #include "jchuff.h" /* Declarations shared with jchuff.c */
#include <limits.h>
#ifdef C_PROGRESSIVE_SUPPORTED #ifdef C_PROGRESSIVE_SUPPORTED
/*
* NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
* used for bit counting rather than the lookup table. This will reduce the
* memory footprint by 64k, which is important for some mobile applications
* that create many isolated instances of libjpeg-turbo (web browsers, for
* instance.) This may improve performance on some mobile platforms as well.
* This feature is enabled by default only on ARM processors, because some x86
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
* shown to have a significant performance impact even on the x86 chips that
* have a fast implementation of it. When building for ARMv6, you can
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
* flags (this defines __thumb__).
*/
/* NOTE: Both GCC and Clang define __GNUC__ */
#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
#if !defined(__thumb__) || defined(__thumb2__)
#define USE_CLZ_INTRINSIC
#endif
#endif

/*
 * Bit-counting helpers used by the encoder routines in this file.
 *
 * JPEG_NBITS(x) is used to obtain the number of bits needed for the
 * magnitude x and may be passed 0.  JPEG_NBITS_NONZERO(x) is the variant for
 * callers that already know x != 0.
 *
 * Caveats:
 *  - With USE_CLZ_INTRINSIC, JPEG_NBITS() evaluates its argument twice, so
 *    the argument must be free of side effects.
 *  - With USE_CLZ_INTRINSIC, JPEG_NBITS_NONZERO(0) is undefined, because the
 *    result of __builtin_clz(0) is undefined.
 *  - The constant 32 assumes that unsigned int is 32 bits wide on the
 *    platforms for which USE_CLZ_INTRINSIC is enabled.
 *  - Without USE_CLZ_INTRINSIC, x is used directly as an index into
 *    jpeg_nbits_table[], so it must lie within that table's bounds (the table
 *    is declared in jpeg_nbits_table.h -- verify its size there.)
 */
#ifdef USE_CLZ_INTRINSIC
#define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
/* The argument is parenthesized so that any expression can be passed. */
#define JPEG_NBITS(x)  ((x) ? JPEG_NBITS_NONZERO(x) : 0)
#else
#include "jpeg_nbits_table.h"
#define JPEG_NBITS(x)  (jpeg_nbits_table[x])
#define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)
#endif
/* Expanded entropy encoder object for progressive Huffman encoding. */ /* Expanded entropy encoder object for progressive Huffman encoding. */
typedef struct { typedef struct {
@@ -320,9 +352,7 @@ emit_eobrun (phuff_entropy_ptr entropy)
if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */ if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */
temp = entropy->EOBRUN; temp = entropy->EOBRUN;
nbits = 0; nbits = JPEG_NBITS_NONZERO(temp) - 1;
while ((temp >>= 1))
nbits++;
/* safety check: shouldn't happen given limited correction-bit buffer */ /* safety check: shouldn't happen given limited correction-bit buffer */
if (nbits > 14) if (nbits > 14)
ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
@@ -378,7 +408,7 @@ METHODDEF(boolean)
encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{ {
phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
register int temp, temp2; register int temp, temp2, temp3;
register int nbits; register int nbits;
int blkn, ci; int blkn, ci;
int Al = cinfo->Al; int Al = cinfo->Al;
@@ -410,20 +440,20 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
entropy->last_dc_val[ci] = temp2; entropy->last_dc_val[ci] = temp2;
/* Encode the DC coefficient difference per section G.1.2.1 */ /* Encode the DC coefficient difference per section G.1.2.1 */
temp2 = temp;
if (temp < 0) { /* This is a well-known technique for obtaining the absolute value without
temp = -temp; /* temp is abs value of input */ * a branch. It is derived from an assembly language technique presented
/* For a negative input, want temp2 = bitwise complement of abs(input) */ * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
/* This code assumes we are on a two's complement machine */ * 1997 by Agner Fog.
temp2--; */
} temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
temp ^= temp3;
temp -= temp3; /* temp is abs value of input */
/* For a negative input, want temp2 = bitwise complement of abs(input) */
temp2 = temp ^ temp3;
/* Find the number of bits needed for the magnitude of the coefficient */ /* Find the number of bits needed for the magnitude of the coefficient */
nbits = 0; nbits = JPEG_NBITS(temp);
while (temp) {
nbits++;
temp >>= 1;
}
/* Check for out-of-range coefficient values. /* Check for out-of-range coefficient values.
* Since we're encoding a difference, the range limit is twice as much. * Since we're encoding a difference, the range limit is twice as much.
*/ */
@@ -465,7 +495,7 @@ METHODDEF(boolean)
encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{ {
phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
register int temp, temp2; register int temp, temp2, temp3;
register int nbits; register int nbits;
register int r, k; register int r, k;
int Se = cinfo->Se; int Se = cinfo->Se;
@@ -497,15 +527,12 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* in C, we shift after obtaining the absolute value; so the code is * in C, we shift after obtaining the absolute value; so the code is
* interwoven with finding the abs value (temp) and output bits (temp2). * interwoven with finding the abs value (temp) and output bits (temp2).
*/ */
if (temp < 0) { temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
temp = -temp; /* temp is abs value of input */ temp ^= temp3;
temp >>= Al; /* apply the point transform */ temp -= temp3; /* temp is abs value of input */
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */ temp >>= Al; /* apply the point transform */
temp2 = ~temp; /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
} else { temp2 = temp ^ temp3;
temp >>= Al; /* apply the point transform */
temp2 = temp;
}
/* Watch out for case that nonzero coef is zero after point transform */ /* Watch out for case that nonzero coef is zero after point transform */
if (temp == 0) { if (temp == 0) {
r++; r++;
@@ -522,9 +549,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
} }
/* Find the number of bits needed for the magnitude of the coefficient */ /* Find the number of bits needed for the magnitude of the coefficient */
nbits = 1; /* there must be at least one 1 bit */ nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */
while ((temp >>= 1))
nbits++;
/* Check for out-of-range coefficient values */ /* Check for out-of-range coefficient values */
if (nbits > MAX_COEF_BITS) if (nbits > MAX_COEF_BITS)
ERREXIT(cinfo, JERR_BAD_DCT_COEF); ERREXIT(cinfo, JERR_BAD_DCT_COEF);
@@ -619,7 +644,7 @@ METHODDEF(boolean)
encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{ {
phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
register int temp; register int temp, temp3;
register int r, k; register int r, k;
int EOB; int EOB;
char *BR_buffer; char *BR_buffer;
@@ -650,8 +675,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* is an integer division with rounding towards 0. To do this portably * is an integer division with rounding towards 0. To do this portably
* in C, we shift after obtaining the absolute value. * in C, we shift after obtaining the absolute value.
*/ */
if (temp < 0) temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
temp = -temp; /* temp is abs value of input */ temp ^= temp3;
temp -= temp3; /* temp is abs value of input */
temp >>= Al; /* apply the point transform */ temp >>= Al; /* apply the point transform */
absvalues[k] = temp; /* save abs value for main pass */ absvalues[k] = temp; /* save abs value for main pass */
if (temp == 1) if (temp == 1)