C/SSE2 optimization of encode_mcu_AC_first()

This commit adds C and SSE2 optimizations for the encode_mcu_AC_first()
function used in progressive Huffman encoding.

The image used for testing can be retrieved from this page:
https://blog.cloudflare.com/doubling-the-speed-of-jpegtran

All timings were measured on `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz`
clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)`
gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0`
gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0`

Here are the results in comparison to libjpeg-turbo@293263c using
`time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg`

C
clang x86_64: +19%
gcc-5 x86_64: +80%
gcc-7 x86_64: +57%
clang i386: +5%
gcc-5 i386: +59%
gcc-7 i386: +51%

SSE2
clang x86_64: +79%
gcc-5 x86_64: +158%
gcc-7 x86_64: +122%
clang i386: +71%
gcc-5 i386: +134%
gcc-7 i386: +135%

Discussion in libjpeg-turbo/libjpeg-turbo#46
This commit is contained in:
mayeut
2018-03-22 11:36:43 -05:00
committed by DRC
parent 16bd984557
commit 5b177b3cab
14 changed files with 641 additions and 56 deletions

201
jcphuff.c
View File

@@ -73,6 +73,10 @@
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
/* Pointer to routine to prepare data for encode_mcu_AC_first() */
void (*AC_first_prepare) (const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits);
/* Pointer to routine to prepare data for encode_mcu_AC_refine() */
int (*AC_refine_prepare) (const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
@@ -144,6 +148,9 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
/* Forward declarations */
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(void) encode_mcu_AC_first_prepare
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
@@ -208,6 +215,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->pub.encode_mcu = encode_mcu_DC_first;
else
entropy->pub.encode_mcu = encode_mcu_AC_first;
if (jsimd_can_encode_mcu_AC_first_prepare())
entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
else
entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
} else {
if (is_DC_band)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
@@ -541,21 +552,116 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
/*
 * Data preparation for encode_mcu_AC_first().
 *
 * COMPUTE_ABSVALUES_AC_FIRST(Sl) walks the first Sl coefficients of block[]
 * in the order given by jpeg_natural_order_start[].  For each coefficient
 * that is still nonzero after the point transform (right shift by Al), it
 *   - stores the shifted magnitude in values[k],
 *   - stores the bits to be emitted in values[k + DCTSIZE2] (the magnitude
 *     itself for a non-negative coefficient, its bitwise complement for a
 *     negative one), and
 *   - sets bit k of zerobits.
 * Entries of values[] that belong to skipped coefficients are not written.
 *
 * The macro is not self-contained; it uses these names from the enclosing
 * scope: block, jpeg_natural_order_start, Al, values, zerobits, and the int
 * scratch variables k, temp, and temp2.
 *
 * The argument is parenthesized and the body is wrapped in do/while(0) so
 * that the macro behaves like a single statement and can safely be given an
 * arbitrary expression.  Invocations must be terminated with a semicolon.
 *
 * NOTE: the sign mask relies on >> of a negative int being an arithmetic
 * shift, which is implementation-defined in C.
 */
#define COMPUTE_ABSVALUES_AC_FIRST(Sl)  do { \
  for (k = 0; k < (Sl); k++) { \
    temp = block[jpeg_natural_order_start[k]]; \
    if (temp == 0) \
      continue; \
    /* We must apply the point transform by Al.  For AC coefficients this \
     * is an integer division with rounding towards 0.  To do this portably \
     * in C, we shift after obtaining the absolute value; so the code is \
     * interwoven with finding the abs value (temp) and output bits (temp2). \
     */ \
    temp2 = temp >> (CHAR_BIT * sizeof(int) - 1);  /* 0 or all ones */ \
    temp ^= temp2; \
    temp -= temp2;              /* temp is abs value of input */ \
    temp >>= Al;                /* apply the point transform */ \
    /* Watch out for case that nonzero coef is zero after point transform */ \
    if (temp == 0) \
      continue; \
    /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
    temp2 ^= temp; \
    values[k] = temp; \
    values[k + DCTSIZE2] = temp2; \
    zerobits |= ((size_t)1U) << k; \
  } \
} while (0)
/*
 * Plain C data preparation routine for encode_mcu_AC_first().
 *
 * block                     coefficient block to scan
 * jpeg_natural_order_start  index table giving the scan order
 * Sl                        number of coefficients to scan
 * Al                        point transform (right shift) to apply
 * values                    output; the macro writes values[k] and
 *                           values[k + DCTSIZE2] for each kept coefficient
 * bits                      output bit mask(s) of kept coefficients: one
 *                           word normally, two words (bits[0] for the first
 *                           32 positions, bits[1] for the rest) when
 *                           SIZEOF_SIZE_T == 4
 */
METHODDEF(void)
encode_mcu_AC_first_prepare(const JCOEF *block,
                            const int *jpeg_natural_order_start, int Sl,
                            int Al, JCOEF *values, size_t *bits)
{
  /* k, temp, temp2 and zerobits are used by COMPUTE_ABSVALUES_AC_FIRST() */
  register int k, temp, temp2;
  size_t zerobits = 0U;
#if SIZEOF_SIZE_T == 4
  /* A 32-bit mask word can only describe the first 32 positions */
  int first_count = (Sl > 32) ? 32 : Sl;
#else
  int first_count = Sl;
#endif

  COMPUTE_ABSVALUES_AC_FIRST(first_count);
  bits[0] = zerobits;

#if SIZEOF_SIZE_T == 4
  /* Second mask word: positions 32 and up, rebased to start at bit 0 */
  zerobits = 0U;
  if (Sl > 32) {
    values += 32;
    jpeg_natural_order_start += 32;
    Sl -= 32;

    COMPUTE_ABSVALUES_AC_FIRST(Sl);
  }
  bits[1] = zerobits;
#endif
}
/*
* MCU encoding for AC initial scan (either spectral selection,
* or first pass of successive approximation).
*/
/*
 * ENCODE_COEFS_AC_FIRST(label) emits the coefficients flagged in zerobits.
 *
 * The macro is not self-contained; it uses these names from the enclosing
 * scope: zerobits, r, cvalue, temp, temp2, nbits, entropy, and cinfo.
 *
 * Each iteration:
 *   - sets r from count_zeroes(&zerobits) and advances cvalue by r
 *     (count_zeroes() is not visible here; presumably it returns the number
 *     of low-order zero bits and shifts them out of zerobits -- confirm),
 *   - reads the magnitude from cvalue[0] and the bits to emit from
 *     cvalue[DCTSIZE2],
 *   - emits one 0xF0 symbol per 16 positions of run length while r > 15,
 *   - raises JERR_BAD_DCT_COEF if the magnitude needs more than
 *     MAX_COEF_BITS bits,
 *   - emits the (run length << 4) + nbits symbol followed by nbits bits of
 *     temp2,
 *   - steps past the coefficient (cvalue++, zerobits >>= 1).
 *
 * The "label" argument is pasted verbatim right after "cvalue += r".  It may
 * be empty, or it may be a label definition (including the trailing colon)
 * so that a caller can jump into the loop body with goto after computing r
 * and cvalue itself.
 */
#define ENCODE_COEFS_AC_FIRST(label) { \
while (zerobits) { \
r = count_zeroes(&zerobits); \
cvalue += r; \
label \
temp = cvalue[0]; \
temp2 = cvalue[DCTSIZE2]; \
\
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
while (r > 15) { \
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
r -= 16; \
} \
\
/* Find the number of bits needed for the magnitude of the coefficient */ \
nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
/* Check for out-of-range coefficient values */ \
if (nbits > MAX_COEF_BITS) \
ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
\
/* Count/emit Huffman symbol for run length / number of bits */ \
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
\
/* Emit that number of bits of the value, if positive, */ \
/* or the complement of its magnitude, if negative. */ \
emit_bits(entropy, (unsigned int)temp2, nbits); \
\
cvalue++; \
zerobits >>= 1; \
} \
}
METHODDEF(boolean)
encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp, temp2, temp3;
register int nbits;
register int r, k;
int Se = cinfo->Se;
register int temp, temp2;
register int nbits, r;
int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
JBLOCKROW block;
JCOEF values_unaligned[2 * DCTSIZE2 + 15];
JCOEF *values;
const JCOEF *cvalue;
size_t zerobits;
size_t bits[8 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -565,61 +671,48 @@ encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
/* Encode the MCU data block */
block = MCU_data[0];
#ifdef WITH_SIMD
cvalue = values = (JCOEF *)PAD((size_t)values_unaligned, 16);
#else
/* Not using SIMD, so alignment is not needed */
cvalue = values = values_unaligned;
#endif
/* Prepare data */
entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
Sl, Al, values, bits);
zerobits = bits[0];
#if SIZEOF_SIZE_T == 4
zerobits |= bits[1];
#endif
/* Emit any pending EOBRUN */
if (zerobits && (entropy->EOBRUN > 0))
emit_eobrun(entropy);
#if SIZEOF_SIZE_T == 4
zerobits = bits[0];
#endif
/* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
r = 0; /* r = run length of zeros */
ENCODE_COEFS_AC_FIRST();
for (k = cinfo->Ss; k <= Se; k++) {
if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
r++;
continue;
}
/* We must apply the point transform by Al. For AC coefficients this
* is an integer division with rounding towards 0. To do this portably
* in C, we shift after obtaining the absolute value; so the code is
* interwoven with finding the abs value (temp) and output bits (temp2).
*/
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
temp ^= temp3;
temp -= temp3; /* temp is abs value of input */
temp >>= Al; /* apply the point transform */
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */
temp2 = temp ^ temp3;
/* Watch out for case that nonzero coef is zero after point transform */
if (temp == 0) {
r++;
continue;
}
/* Emit any pending EOBRUN */
if (entropy->EOBRUN > 0)
emit_eobrun(entropy);
/* if run length > 15, must emit special run-length-16 codes (0xF0) */
while (r > 15) {
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
r -= 16;
}
/* Find the number of bits needed for the magnitude of the coefficient */
nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */
/* Check for out-of-range coefficient values */
if (nbits > MAX_COEF_BITS)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count/emit Huffman symbol for run length / number of bits */
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
emit_bits(entropy, (unsigned int)temp2, nbits);
r = 0; /* reset zero run length */
#if SIZEOF_SIZE_T == 4
zerobits = bits[1];
if (zerobits) {
int diff = ((values + DCTSIZE2 / 2) - cvalue);
r = count_zeroes(&zerobits);
r += diff;
cvalue += r;
goto first_iter_ac_first;
}
if (r > 0) { /* If there are trailing zeroes, */
ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
#endif
if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
if (entropy->EOBRUN == 0x7FFF)
emit_eobrun(entropy); /* force it out to avoid overflow */