36% faster encode_mcu_AC_first(), by eliminating an unpredictable branch in the inner loop

This corresponds to 15% faster overall encoding in "progressive scan optimization" mode
(and a negligible speedup in -fastcrush mode).
This commit is contained in:
Loren Merritt
2014-03-07 00:03:44 +00:00
parent f4c556031c
commit 8db334cde2

View File

@@ -468,6 +468,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
int Se = cinfo->Se;
int Al = cinfo->Al;
JBLOCKROW block;
int deadzone = (1 << Al) - 1;
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -485,29 +486,21 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
r = 0; /* r = run length of zeros */
for (k = cinfo->Ss; k <= Se; k++) {
if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
temp = (*block)[jpeg_natural_order[k]];
if ((unsigned)(temp + deadzone) <= 2*deadzone) {
r++;
continue;
}
/* We must apply the point transform by Al. For AC coefficients this
* is an integer division with rounding towards 0. To do this portably
* in C, we shift after obtaining the absolute value; so the code is
* interwoven with finding the abs value (temp) and output bits (temp2).
*/
if (temp < 0) {
temp = -temp; /* temp is abs value of input */
temp >>= Al; /* apply the point transform */
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */
temp2 = ~temp;
} else {
temp >>= Al; /* apply the point transform */
temp2 = temp;
}
/* Watch out for case that nonzero coef is zero after point transform */
if (temp == 0) {
r++;
continue;
}
int sign = temp >> 31;
temp += sign;
temp2 = temp >> Al;
temp = (temp ^ sign) >> Al;
/* Emit any pending EOBRUN */
if (entropy->EOBRUN > 0)