36% faster encode_mcu_AC_first(), by eliminating an unpredictable branch in the inner loop
This corresponds to 15% faster overall encoding in "progressive scan optimization" mode (and a negligible speedup in -fastcrush mode).
This commit is contained in:
23
jcphuff.c
23
jcphuff.c
@@ -468,6 +468,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
int Se = cinfo->Se;
|
||||
int Al = cinfo->Al;
|
||||
JBLOCKROW block;
|
||||
int deadzone = (1 << Al) - 1;
|
||||
|
||||
entropy->next_output_byte = cinfo->dest->next_output_byte;
|
||||
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
|
||||
@@ -485,29 +486,21 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
r = 0; /* r = run length of zeros */
|
||||
|
||||
for (k = cinfo->Ss; k <= Se; k++) {
|
||||
if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
|
||||
temp = (*block)[jpeg_natural_order[k]];
|
||||
if ((unsigned)(temp + deadzone) <= 2*deadzone) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* We must apply the point transform by Al. For AC coefficients this
|
||||
* is an integer division with rounding towards 0. To do this portably
|
||||
* in C, we shift after obtaining the absolute value; so the code is
|
||||
* interwoven with finding the abs value (temp) and output bits (temp2).
|
||||
*/
|
||||
if (temp < 0) {
|
||||
temp = -temp; /* temp is abs value of input */
|
||||
temp >>= Al; /* apply the point transform */
|
||||
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */
|
||||
temp2 = ~temp;
|
||||
} else {
|
||||
temp >>= Al; /* apply the point transform */
|
||||
temp2 = temp;
|
||||
}
|
||||
/* Watch out for case that nonzero coef is zero after point transform */
|
||||
if (temp == 0) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
int sign = temp >> 31;
|
||||
temp += sign;
|
||||
temp2 = temp >> Al;
|
||||
temp = (temp ^ sign) >> Al;
|
||||
|
||||
/* Emit any pending EOBRUN */
|
||||
if (entropy->EOBRUN > 0)
|
||||
|
||||
Reference in New Issue
Block a user