Greatly improve performance of Huffman decoding

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@64 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2009-09-28 00:33:02 +00:00
parent 6c0e1fc72e
commit e2816648d8
2 changed files with 269 additions and 99 deletions

241
jdhuff.c
View File

@@ -14,6 +14,21 @@
* storage only upon successful completion of an MCU.
*/
/* Modifications:
* Copyright (C)2007 Sun Microsystems, Inc.
* Copyright (C)2009 D. R. Commander
*
* This library is free software and may be redistributed and/or modified under
* the terms of the wxWindows Library License, Version 3.1 or (at your option)
* any later version. The full license is in the LICENSE.txt file included
* with this distribution.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* wxWindows Library License for more details.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
@@ -234,7 +249,8 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
* with that code.
*/
MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
p = 0;
for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
@@ -243,8 +259,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
/* Generate left-justified code followed by all possible bit sequences */
lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
dtbl->look_nbits[lookbits] = l;
dtbl->look_sym[lookbits] = htbl->huffval[p];
dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
lookbits++;
}
}
@@ -438,9 +453,10 @@ jpeg_huff_decode (bitread_working_state * state,
* On some machines, a shift and add will be faster than a table lookup.
*/
#define AVOID_TABLES
#ifdef AVOID_TABLES
#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
#define HUFF_EXTEND(x,s) ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((-1)<<(s)) + 1)))
#else
@@ -498,47 +514,19 @@ process_restart (j_decompress_ptr cinfo)
}
/*
* Decode and return one MCU's worth of Huffman-compressed coefficients.
* The coefficients are reordered from zigzag order into natural array order,
* but are not dequantized.
*
* The i'th block of the MCU is stored into the block pointed to by
* MCU_data[i]. WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER.
* (Wholesale zeroing is usually a little faster than retail...)
*
* Returns FALSE if data source requested suspension. In that case no
* changes have been made to permanent state. (Exception: some output
* coefficients may already have been assigned. This is harmless for
* this module, since we'll just re-assign them on the next call.)
*/
METHODDEF(boolean)
decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
LOCAL(boolean)
decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
int blkn;
BITREAD_STATE_VARS;
int blkn;
savable_state state;
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
if (! process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
if (! entropy->pub.insufficient_data) {
/* Outer loop handles each block in the MCU */
/* Load up working state */
BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
ASSIGN_STATE(state, entropy->saved);
/* Outer loop handles each block in the MCU */
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
JBLOCKROW block = MCU_data[blkn];
d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
@@ -611,13 +599,192 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
k += 15;
}
}
}
}
/* Completed MCU, so update state */
BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
ASSIGN_STATE(entropy->saved, state);
return TRUE;
}
/***************************************************************/
#define ADD_BYTE { \
int val0 = *(buffer++); \
int val1 = *(buffer); \
\
bits_left += 8; \
get_buffer = (get_buffer << 8) | (val0); \
if (val0 == 0xFF) { \
buffer++; \
if (val1 != 0) { \
buffer -= 2; \
get_buffer &= ~0xFF; \
} \
} \
}
/***************************************************************/
#if __WORDSIZE == 64
#define ENSURE_SHORT \
if (bits_left < 16) { \
ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE \
}
#else
#define ENSURE_SHORT if (bits_left < 16) { ADD_BYTE ADD_BYTE }
#endif
/***************************************************************/
#define HUFF_DECODE_FAST(symbol, size, htbl) { \
ENSURE_SHORT \
symbol = PEEK_BITS(HUFF_LOOKAHEAD); \
symbol = htbl->lookup[symbol]; \
size = symbol >> 8; \
bits_left -= size; \
symbol = symbol & ((1 << HUFF_LOOKAHEAD) - 1); \
if (size == HUFF_LOOKAHEAD + 1) { \
symbol = (get_buffer >> bits_left) & ((1 << (size)) - 1); \
while (symbol > htbl->maxcode[size]) { \
symbol <<= 1; \
symbol |= GET_BITS(1); \
size++; \
} \
symbol = htbl->pub->huffval[ (int) (symbol + htbl->valoffset[size]) ]; \
} \
}
/***************************************************************/
LOCAL(boolean)
decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
BITREAD_STATE_VARS;
JOCTET *buffer;
int blkn;
savable_state state;
/* Outer loop handles each block in the MCU */
/* Load up working state */
BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
buffer = (JOCTET *) br_state.next_input_byte;
ASSIGN_STATE(state, entropy->saved);
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
JBLOCKROW block = MCU_data[blkn];
d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
register int s, k, r, l;
HUFF_DECODE_FAST(s, l, dctbl);
if (s) {
ENSURE_SHORT
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
}
if (entropy->dc_needed[blkn]) {
int ci = cinfo->MCU_membership[blkn];
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
(*block)[0] = (JCOEF) s;
}
if (entropy->ac_needed[blkn]) {
for (k = 1; k < DCTSIZE2; k++) {
HUFF_DECODE_FAST(s, l, actbl);
r = s >> 4;
s &= 15;
if (s) {
k += r;
ENSURE_SHORT
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
(*block)[jpeg_natural_order[k]] = (JCOEF) s;
} else {
if (r != 15) break;
k += 15;
}
}
} else {
for (k = 1; k < DCTSIZE2; k++) {
HUFF_DECODE_FAST(s, l, actbl);
r = s >> 4;
s &= 15;
if (s) {
k += r;
ENSURE_SHORT
DROP_BITS(s);
} else {
if (r != 15) break;
k += 15;
}
}
}
}
br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
br_state.next_input_byte = buffer;
BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
ASSIGN_STATE(entropy->saved, state);
return TRUE;
}
/*
* Decode and return one MCU's worth of Huffman-compressed coefficients.
* The coefficients are reordered from zigzag order into natural array order,
* but are not dequantized.
*
* The i'th block of the MCU is stored into the block pointed to by
* MCU_data[i]. WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER.
* (Wholesale zeroing is usually a little faster than retail...)
*
* Returns FALSE if data source requested suspension. In that case no
* changes have been made to permanent state. (Exception: some output
* coefficients may already have been assigned. This is harmless for
* this module, since we'll just re-assign them on the next call.)
*/
#define BUFSIZE (DCTSIZE2 * 2)
METHODDEF(boolean)
decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
if (! process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
if (! entropy->pub.insufficient_data) {
if (cinfo->src->bytes_in_buffer >= BUFSIZE) {
if (!decode_mcu_fast(cinfo, MCU_data)) return FALSE;
}
else {
if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
}
}
/* Account for restart interval (no-op if not using restarts) */

View File

@@ -36,13 +36,17 @@ typedef struct {
/* Link to public Huffman table (needed only in jpeg_huff_decode) */
JHUFF_TBL *pub;
/* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
/* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of
* the input data stream. If the next Huffman code is no more
* than HUFF_LOOKAHEAD bits long, we can obtain its length and
* the corresponding symbol directly from these tables.
* the corresponding symbol directly from this tables.
*
* The lower 8 bits of each table entry contain the number of
* bits in the corresponding Huffman code, or HUFF_LOOKAHEAD + 1
* if too long. The next 8 bits of each entry contain the
* symbol.
*/
int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
int lookup[1<<HUFF_LOOKAHEAD];
} d_derived_tbl;
/* Expand a Huffman table definition into the derived format */
@@ -69,8 +73,8 @@ EXTERN(void) jpeg_make_d_derived_tbl
* necessary.
*/
typedef INT32 bit_buf_type; /* type of bit-extraction buffer */
#define BIT_BUF_SIZE 32 /* size of buffer in bits */
typedef long bit_buf_type; /* type of bit-extraction buffer */
#define BIT_BUF_SIZE __WORDSIZE /* size of buffer in bits */
/* If long is > 32 bits on your machine, and shifting/masking longs is
* reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
@@ -183,11 +187,10 @@ EXTERN(boolean) jpeg_fill_bit_buffer
} \
} \
look = PEEK_BITS(HUFF_LOOKAHEAD); \
if ((nb = htbl->look_nbits[look]) != 0) { \
if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \
DROP_BITS(nb); \
result = htbl->look_sym[look]; \
result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
} else { \
nb = HUFF_LOOKAHEAD+1; \
slowlabel: \
if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
{ failaction; } \