The loop in jsimd_quantize_neon() executes only twice and should be unrolled for AArch64 targets. GCC does that by default, but Clang 11 and the later versions available at the time of this writing do not. This patch adds an unroll pragma when targeting AArch64 with Clang.

We do not use the unroll pragma for AArch32 targets, because it causes the Clang-generated assembly code to exhaust the available Neon registers (32 x 64-bit) and spill to the stack. (DRC: Referring to the discussion in #570, this is likely due to compiler confusion that results in poor register allocation. It is possible to eliminate the spillage and reduce the instruction count by loading the data on a just-in-time basis, thus explicitly interleaving compute and I/O, but the performance implications of that are currently unknown.)

The effects of unrolling the quantization loop are: 1) the elimination of loop control flow overhead, and 2) the use of LDP/STP instructions, which work from a single base pointer, instead of twice as many LDR/STR instructions, each of which requires an address calculation.

Closes #570
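In sketch form, the guarded pragma added by this patch looks like the following (the actual hunk is in jsimd_quantize_neon() in jquanti-neon.c, shown below):

    #if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
    #pragma unroll
    #endif
    for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
      /* quantize rows i through i + 3 */
    }

_M_ARM64 covers Windows Arm64 builds, where the MSVC-style macro is defined instead of __aarch64__.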
/*
 * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"

#include <arm_neon.h>


/* After downsampling, the resulting sample values are in the range [0, 255],
 * but the Discrete Cosine Transform (DCT) operates on values centered around
 * 0.
 *
 * To prepare sample values for the DCT, load samples into a DCT workspace,
 * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
 * are also widened from 8- to 16-bit.
 *
 * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
 */

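/* For reference, the scalar convsamp() logic is roughly equivalent to the
 * following sketch (simplified; see jcdctmgr.c for the actual code):
 *
 *   for (row = 0; row < DCTSIZE; row++)
 *     for (col = 0; col < DCTSIZE; col++)
 *       *workspace++ = (DCTELEM)sample_data[row][start_col + col] -
 *                      CENTERJSAMPLE;
 */
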
void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
                         DCTELEM *workspace)
{
  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);

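  /* vsubl_u8() subtracts CENTERJSAMPLE and widens each sample from 8 to 16
   * bits in a single instruction; the vreinterpretq casts compile away.
   */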
  int16x8_t row0 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row1 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row2 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row3 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row4 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row5 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row6 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row7 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));

  vst1q_s16(workspace + 0 * DCTSIZE, row0);
  vst1q_s16(workspace + 1 * DCTSIZE, row1);
  vst1q_s16(workspace + 2 * DCTSIZE, row2);
  vst1q_s16(workspace + 3 * DCTSIZE, row3);
  vst1q_s16(workspace + 4 * DCTSIZE, row4);
  vst1q_s16(workspace + 5 * DCTSIZE, row5);
  vst1q_s16(workspace + 6 * DCTSIZE, row6);
  vst1q_s16(workspace + 7 * DCTSIZE, row7);
}


/* After the DCT, the resulting array of coefficient values needs to be
 * divided by an array of quantization values.
 *
 * To avoid a slow division operation, the DCT coefficients are multiplied by
 * the (scaled) reciprocals of the quantization values and then right-shifted.
 *
 * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
 */

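/* In outline, each coefficient x is quantized as follows (a sketch of the
 * arithmetic performed below):
 *
 *   sign = x >> 15;                          (arithmetic shift: 0 or -1)
 *   tmp  = (|x| + correction) * reciprocal;  (16 x 16 -> 32-bit)
 *   tmp  = (tmp >> 16) >> shift;
 *   out  = (tmp ^ sign) - sign;              (restore the sign)
 */
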
void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
                         DCTELEM *workspace)
{
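  /* The divisors table consists of four consecutive DCTSIZE2-element blocks:
   * reciprocals, corrections, scale values, and shift amounts.  Only three
   * of the blocks are used here; the scale values (at offset 2 * DCTSIZE2)
   * are presumably consumed elsewhere.  (See compute_reciprocal() in
   * jcdctmgr.c.)
   */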
  JCOEFPTR out_ptr = coef_block;
  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
  int i;

#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
#pragma unroll
#endif
  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
    /* Load DCT coefficients. */
    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
    /* Load reciprocals of quantization values. */
    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);

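    /* The arithmetic shifts below replicate each coefficient's sign bit
     * across the lane, producing 0 for non-negative values and -1 (all ones)
     * for negative values.  These masks are used to restore the sign after
     * the unsigned multiply.
     */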
    /* Extract sign from coefficients. */
    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
    /* Get absolute value of DCT coefficients. */
    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
    /* Add correction. */
    abs_row0 = vaddq_u16(abs_row0, corr0);
    abs_row1 = vaddq_u16(abs_row1, corr1);
    abs_row2 = vaddq_u16(abs_row2, corr2);
    abs_row3 = vaddq_u16(abs_row3, corr3);

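    /* vmull_u16() widens each product to 32 bits, so no precision is lost
     * before the high halves are extracted by the narrowing shifts that
     * follow.
     */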
    /* Multiply DCT coefficients by quantization reciprocals. */
    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
                                                       vget_low_u16(recip0)));
    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
                                                       vget_high_u16(recip0)));
    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
                                                       vget_low_u16(recip1)));
    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
                                                       vget_high_u16(recip1)));
    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
                                                       vget_low_u16(recip2)));
    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
                                                       vget_high_u16(recip2)));
    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
                                                       vget_low_u16(recip3)));
    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
                                                       vget_high_u16(recip3)));
    /* Narrow back to 16-bit. */
    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));

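    /* Note: vshlq_u16() shifts each lane by a per-lane signed count, and a
     * negative count shifts right, which is what the negation below relies
     * on.
     */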
    /* Since VSHR only supports an immediate as its second argument, negate
     * the shift value and shift left.
     */
    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
                                           vnegq_s16(shift0)));
    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
                                           vnegq_s16(shift1)));
    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
                                           vnegq_s16(shift2)));
    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
                                           vnegq_s16(shift3)));

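    /* (v ^ sign) - sign leaves v unchanged when sign is 0 and computes the
     * two's complement negation (~v + 1) when sign is -1.
     */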
    /* Restore sign to original product. */
    row0 = veorq_s16(row0, sign_row0);
    row0 = vsubq_s16(row0, sign_row0);
    row1 = veorq_s16(row1, sign_row1);
    row1 = vsubq_s16(row1, sign_row1);
    row2 = veorq_s16(row2, sign_row2);
    row2 = vsubq_s16(row2, sign_row2);
    row3 = veorq_s16(row3, sign_row3);
    row3 = vsubq_s16(row3, sign_row3);

    /* Store quantized coefficients to memory. */
    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
  }
}