"Optimise" quantization step by replacing the division by a multiplication.
This has no measurable difference right now but makes it possible to do SIMD implementations of this stage. git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@16 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
174
jcdctmgr.c
174
jcdctmgr.c
@@ -2,6 +2,7 @@
|
||||
* jcdctmgr.c
|
||||
*
|
||||
* Copyright (C) 1994-1996, Thomas G. Lane.
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* This file is part of the Independent JPEG Group's software.
|
||||
* For conditions of distribution and use, see the accompanying README file.
|
||||
@@ -64,6 +65,128 @@ typedef struct {
|
||||
typedef my_fdct_controller * my_fdct_ptr;
|
||||
|
||||
|
||||
/*
|
||||
* Find the highest bit in an integer through binary search.
|
||||
*/
|
||||
LOCAL(int)
|
||||
fls (UINT16 val)
|
||||
{
|
||||
int bit;
|
||||
|
||||
bit = 16;
|
||||
|
||||
if (!val)
|
||||
return 0;
|
||||
|
||||
if (!(val & 0xff00)) {
|
||||
bit -= 8;
|
||||
val <<= 8;
|
||||
}
|
||||
if (!(val & 0xf000)) {
|
||||
bit -= 4;
|
||||
val <<= 4;
|
||||
}
|
||||
if (!(val & 0xc000)) {
|
||||
bit -= 2;
|
||||
val <<= 2;
|
||||
}
|
||||
if (!(val & 0x8000)) {
|
||||
bit -= 1;
|
||||
val <<= 1;
|
||||
}
|
||||
|
||||
return bit;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute values to do a division using reciprocal.
|
||||
*
|
||||
* This implementation is based on an algorithm described in
|
||||
* "How to optimize for the Pentium family of microprocessors"
|
||||
* (http://www.agner.org/assem/).
|
||||
* More information about the basic algorithm can be found in
|
||||
* the paper "Integer Division Using Reciprocals" by Robert Alverson.
|
||||
*
|
||||
* The basic idea is to replace x/d by x * d^-1. In order to store
|
||||
* d^-1 with enough precision we shift it left a few places. It turns
|
||||
* out that this algoright gives just enough precision, and also fits
|
||||
* into DCTELEM:
|
||||
*
|
||||
* b = (the number of significant bits in divisor) - 1
|
||||
* r = (word size) + b
|
||||
* f = 2^r / divisor
|
||||
*
|
||||
* f will not be an integer for most cases, so we need to compensate
|
||||
* for the rounding error introduced:
|
||||
*
|
||||
* no fractional part:
|
||||
*
|
||||
* result = input >> r
|
||||
*
|
||||
* fractional part of f < 0.5:
|
||||
*
|
||||
* round f down to nearest integer
|
||||
* result = ((input + 1) * f) >> r
|
||||
*
|
||||
* fractional part of f > 0.5:
|
||||
*
|
||||
* round f up to nearest integer
|
||||
* result = (input * f) >> r
|
||||
*
|
||||
* This is the original algorithm that gives truncated results. But we
|
||||
* want properly rounded results, so we replace "input" with
|
||||
* "input + divisor/2".
|
||||
*
|
||||
* In order to allow SIMD implementations we also tweak the values to
|
||||
* allow the same calculation to be made at all times:
|
||||
*
|
||||
* dctbl[0] = f rounded to nearest integer
|
||||
* dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
|
||||
* dctbl[2] = 1 << ((word size) * 2 - r)
|
||||
* dctbl[3] = r - (word size)
|
||||
*
|
||||
* dctbl[2] is for stupid instruction sets where the shift operation
|
||||
* isn't member wise (e.g. MMX).
|
||||
*
|
||||
* The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
|
||||
* is that most SIMD implementations have a "multiply and store top
|
||||
* half" operation.
|
||||
*
|
||||
* Lastly, we store each of the values in their own table instead
|
||||
* of in a consecutive manner, yet again in order to allow SIMD
|
||||
* routines.
|
||||
*/
|
||||
LOCAL(void)
|
||||
compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
|
||||
{
|
||||
UDCTELEM2 fq, fr;
|
||||
UDCTELEM c;
|
||||
int b, r;
|
||||
|
||||
b = fls(divisor) - 1;
|
||||
r = sizeof(DCTELEM) * 8 + b;
|
||||
|
||||
fq = ((UDCTELEM2)1 << r) / divisor;
|
||||
fr = ((UDCTELEM2)1 << r) % divisor;
|
||||
|
||||
c = divisor / 2; /* for rounding */
|
||||
|
||||
if (fr == 0) { /* divisor is power of two */
|
||||
/* fq will be one bit too large to fit in DCTELEM, so adjust */
|
||||
fq >>= 1;
|
||||
r--;
|
||||
} else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
|
||||
c++;
|
||||
} else { /* fractional part is > 0.5 */
|
||||
fq++;
|
||||
}
|
||||
|
||||
dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
|
||||
dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
|
||||
dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
|
||||
dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize for a processing pass.
|
||||
* Verify that all referenced Q-tables are present, and set up
|
||||
@@ -101,11 +224,11 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
|
||||
if (fdct->divisors[qtblno] == NULL) {
|
||||
fdct->divisors[qtblno] = (DCTELEM *)
|
||||
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
|
||||
DCTSIZE2 * SIZEOF(DCTELEM));
|
||||
(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
|
||||
}
|
||||
dtbl = fdct->divisors[qtblno];
|
||||
for (i = 0; i < DCTSIZE2; i++) {
|
||||
dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
|
||||
compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
@@ -135,14 +258,14 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
|
||||
if (fdct->divisors[qtblno] == NULL) {
|
||||
fdct->divisors[qtblno] = (DCTELEM *)
|
||||
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
|
||||
DCTSIZE2 * SIZEOF(DCTELEM));
|
||||
(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
|
||||
}
|
||||
dtbl = fdct->divisors[qtblno];
|
||||
for (i = 0; i < DCTSIZE2; i++) {
|
||||
dtbl[i] = (DCTELEM)
|
||||
compute_reciprocal(
|
||||
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
|
||||
(INT32) aanscales[i]),
|
||||
CONST_BITS-3);
|
||||
CONST_BITS-3), &dtbl[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -233,41 +356,30 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
|
||||
METHODDEF(void)
|
||||
quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
|
||||
{
|
||||
register DCTELEM temp, qval;
|
||||
register int i;
|
||||
register JCOEFPTR output_ptr = coef_block;
|
||||
int i;
|
||||
DCTELEM temp;
|
||||
UDCTELEM recip, corr, shift;
|
||||
UDCTELEM2 product;
|
||||
JCOEFPTR output_ptr = coef_block;
|
||||
|
||||
for (i = 0; i < DCTSIZE2; i++) {
|
||||
qval = divisors[i];
|
||||
temp = workspace[i];
|
||||
|
||||
/* Divide the coefficient value by qval, ensuring proper rounding.
|
||||
* Since C does not specify the direction of rounding for negative
|
||||
* quotients, we have to force the dividend positive for portability.
|
||||
*
|
||||
* In most files, at least half of the output values will be zero
|
||||
* (at default quantization settings, more like three-quarters...)
|
||||
* so we should ensure that this case is fast. On many machines,
|
||||
* a comparison is enough cheaper than a divide to make a special test
|
||||
* a win. Since both inputs will be nonnegative, we need only test
|
||||
* for a < b to discover whether a/b is 0.
|
||||
* If your machine's division is fast enough, define FAST_DIVIDE.
|
||||
*/
|
||||
#ifdef FAST_DIVIDE
|
||||
#define DIVIDE_BY(a,b) a /= b
|
||||
#else
|
||||
#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
|
||||
#endif
|
||||
recip = divisors[i + DCTSIZE2 * 0];
|
||||
corr = divisors[i + DCTSIZE2 * 1];
|
||||
shift = divisors[i + DCTSIZE2 * 3];
|
||||
|
||||
if (temp < 0) {
|
||||
temp = -temp;
|
||||
temp += qval>>1; /* for rounding */
|
||||
DIVIDE_BY(temp, qval);
|
||||
product = (UDCTELEM2)(temp + corr) * recip;
|
||||
product >>= shift + sizeof(DCTELEM)*8;
|
||||
temp = product;
|
||||
temp = -temp;
|
||||
} else {
|
||||
temp += qval>>1; /* for rounding */
|
||||
DIVIDE_BY(temp, qval);
|
||||
product = (UDCTELEM2)(temp + corr) * recip;
|
||||
product >>= shift + sizeof(DCTELEM)*8;
|
||||
temp = product;
|
||||
}
|
||||
|
||||
output_ptr[i] = (JCOEF) temp;
|
||||
}
|
||||
}
|
||||
|
||||
7
jdct.h
7
jdct.h
@@ -23,13 +23,18 @@
|
||||
* have a range of +-8K for 8-bit data, +-128K for 12-bit data. This
|
||||
* convention improves accuracy in integer implementations and saves some
|
||||
* work in floating-point ones.
|
||||
* Quantization of the output coefficients is done by jcdctmgr.c.
|
||||
* Quantization of the output coefficients is done by jcdctmgr.c. This
|
||||
* step requires an unsigned type and also one with twice the bits.
|
||||
*/
|
||||
|
||||
#if BITS_IN_JSAMPLE == 8
|
||||
typedef int DCTELEM; /* 16 or 32 bits is fine */
|
||||
typedef unsigned int UDCTELEM;
|
||||
typedef unsigned long long UDCTELEM2;
|
||||
#else
|
||||
typedef INT32 DCTELEM; /* must have 32 bits */
|
||||
typedef UINT32 UDCTELEM;
|
||||
typedef unsigned long long UDCTELEM2;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user