"Optimise" quantization step by replacing the division by a multiplication.

This has no measurable difference right now but makes it possible to do
SIMD implementations of this stage.


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@16 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
Pierre Ossman
2009-03-09 13:23:04 +00:00
parent 2ae181c7b8
commit dedc42e268
2 changed files with 149 additions and 32 deletions

View File

@@ -2,6 +2,7 @@
* jcdctmgr.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
@@ -64,6 +65,128 @@ typedef struct {
typedef my_fdct_controller * my_fdct_ptr;
/*
* Find the highest bit in an integer through binary search.
*/
LOCAL(int)
fls (UINT16 val)
{
int bit;
bit = 16;
if (!val)
return 0;
if (!(val & 0xff00)) {
bit -= 8;
val <<= 8;
}
if (!(val & 0xf000)) {
bit -= 4;
val <<= 4;
}
if (!(val & 0xc000)) {
bit -= 2;
val <<= 2;
}
if (!(val & 0x8000)) {
bit -= 1;
val <<= 1;
}
return bit;
}
/*
* Compute values to do a division using reciprocal.
*
* This implementation is based on an algorithm described in
* "How to optimize for the Pentium family of microprocessors"
* (http://www.agner.org/assem/).
* More information about the basic algorithm can be found in
* the paper "Integer Division Using Reciprocals" by Robert Alverson.
*
* The basic idea is to replace x/d by x * d^-1. In order to store
* d^-1 with enough precision we shift it left a few places. It turns
* out that this algoright gives just enough precision, and also fits
* into DCTELEM:
*
* b = (the number of significant bits in divisor) - 1
* r = (word size) + b
* f = 2^r / divisor
*
* f will not be an integer for most cases, so we need to compensate
* for the rounding error introduced:
*
* no fractional part:
*
* result = input >> r
*
* fractional part of f < 0.5:
*
* round f down to nearest integer
* result = ((input + 1) * f) >> r
*
* fractional part of f > 0.5:
*
* round f up to nearest integer
* result = (input * f) >> r
*
* This is the original algorithm that gives truncated results. But we
* want properly rounded results, so we replace "input" with
* "input + divisor/2".
*
* In order to allow SIMD implementations we also tweak the values to
* allow the same calculation to be made at all times:
*
* dctbl[0] = f rounded to nearest integer
* dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
* dctbl[2] = 1 << ((word size) * 2 - r)
* dctbl[3] = r - (word size)
*
* dctbl[2] is for stupid instruction sets where the shift operation
* isn't member wise (e.g. MMX).
*
* The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
* is that most SIMD implementations have a "multiply and store top
* half" operation.
*
* Lastly, we store each of the values in their own table instead
* of in a consecutive manner, yet again in order to allow SIMD
* routines.
*/
LOCAL(void)
compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
{
UDCTELEM2 fq, fr;
UDCTELEM c;
int b, r;
b = fls(divisor) - 1;
r = sizeof(DCTELEM) * 8 + b;
fq = ((UDCTELEM2)1 << r) / divisor;
fr = ((UDCTELEM2)1 << r) % divisor;
c = divisor / 2; /* for rounding */
if (fr == 0) { /* divisor is power of two */
/* fq will be one bit too large to fit in DCTELEM, so adjust */
fq >>= 1;
r--;
} else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
c++;
} else { /* fractional part is > 0.5 */
fq++;
}
dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
}
/*
* Initialize for a processing pass.
* Verify that all referenced Q-tables are present, and set up
@@ -101,11 +224,11 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DCTSIZE2 * SIZEOF(DCTELEM));
(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
}
break;
#endif
@@ -135,14 +258,14 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DCTSIZE2 * SIZEOF(DCTELEM));
(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
dtbl[i] = (DCTELEM)
compute_reciprocal(
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
(INT32) aanscales[i]),
CONST_BITS-3);
CONST_BITS-3), &dtbl[i]);
}
}
break;
@@ -233,41 +356,30 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
METHODDEF(void)
quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
{
register DCTELEM temp, qval;
register int i;
register JCOEFPTR output_ptr = coef_block;
int i;
DCTELEM temp;
UDCTELEM recip, corr, shift;
UDCTELEM2 product;
JCOEFPTR output_ptr = coef_block;
for (i = 0; i < DCTSIZE2; i++) {
qval = divisors[i];
temp = workspace[i];
/* Divide the coefficient value by qval, ensuring proper rounding.
* Since C does not specify the direction of rounding for negative
* quotients, we have to force the dividend positive for portability.
*
* In most files, at least half of the output values will be zero
* (at default quantization settings, more like three-quarters...)
* so we should ensure that this case is fast. On many machines,
* a comparison is enough cheaper than a divide to make a special test
* a win. Since both inputs will be nonnegative, we need only test
* for a < b to discover whether a/b is 0.
* If your machine's division is fast enough, define FAST_DIVIDE.
*/
#ifdef FAST_DIVIDE
#define DIVIDE_BY(a,b) a /= b
#else
#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
#endif
recip = divisors[i + DCTSIZE2 * 0];
corr = divisors[i + DCTSIZE2 * 1];
shift = divisors[i + DCTSIZE2 * 3];
if (temp < 0) {
temp = -temp;
temp += qval>>1; /* for rounding */
DIVIDE_BY(temp, qval);
product = (UDCTELEM2)(temp + corr) * recip;
product >>= shift + sizeof(DCTELEM)*8;
temp = product;
temp = -temp;
} else {
temp += qval>>1; /* for rounding */
DIVIDE_BY(temp, qval);
product = (UDCTELEM2)(temp + corr) * recip;
product >>= shift + sizeof(DCTELEM)*8;
temp = product;
}
output_ptr[i] = (JCOEF) temp;
}
}

7
jdct.h
View File

@@ -23,13 +23,18 @@
* have a range of +-8K for 8-bit data, +-128K for 12-bit data. This
* convention improves accuracy in integer implementations and saves some
* work in floating-point ones.
* Quantization of the output coefficients is done by jcdctmgr.c.
* Quantization of the output coefficients is done by jcdctmgr.c. This
* step requires an unsigned type and also one with twice the bits.
*/
#if BITS_IN_JSAMPLE == 8
typedef int DCTELEM; /* 16 or 32 bits is fine */
typedef unsigned int UDCTELEM;
typedef unsigned long long UDCTELEM2;
#else
typedef INT32 DCTELEM; /* must have 32 bits */
typedef UINT32 UDCTELEM;
typedef unsigned long long UDCTELEM2;
#endif