Neon: Intrinsics implementation of h2v1 & h2v2 plain upsampling
There was no previous GAS implementation. NOTE: This doesn't produce much of a speedup when using -O3, because -O3 already enables Neon autovectorization, which works well for the scalar C implementation of plain upsampling. However, the Neon SIMD implementation will benefit other optimization levels.
This commit is contained in:
@@ -399,12 +399,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -412,12 +433,16 @@ GLOBAL(void)
|
||||
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  /* Thin dispatcher: forwards the vertical sampling factor and output width
   * taken from cinfo, together with the sample arrays, to the Neon routine.
   * compptr is not used here.
   */
  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor,
                           cinfo->output_width,
                           input_data,
                           output_data_ptr);
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||
input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
|
||||
@@ -467,12 +467,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -480,12 +501,16 @@ GLOBAL(void)
|
||||
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  /* Hand the work to the Neon implementation.  Only two fields of cinfo are
   * consulted (max_v_samp_factor and output_width); compptr is unused.
   */
  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor,
                           cinfo->output_width,
                           input_data,
                           output_data_ptr);
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||
input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
|
||||
@@ -465,3 +465,105 @@ void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows a row of samples produced by h2v1 downsampling.
|
||||
*
|
||||
* s0 s1
|
||||
* +---------+---------+
|
||||
* | | |
|
||||
* | p0 p1 | p2 p3 |
|
||||
* | | |
|
||||
* +---------+---------+
|
||||
*
|
||||
* Samples s0 and s1 were created by averaging the original pixel component
|
||||
* values centered at positions p0-p3 above. To approximate those original
|
||||
* pixel component values, we duplicate the samples horizontally:
|
||||
* p0(upsampled) = p1(upsampled) = s0
|
||||
* p2(upsampled) = p3(upsampled) = s1
|
||||
*/
|
||||
|
||||
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr, outptr;
|
||||
int inrow;
|
||||
unsigned colctr;
|
||||
|
||||
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
|
||||
inptr = input_data[inrow];
|
||||
outptr = output_data[inrow];
|
||||
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
||||
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
||||
/* Duplicate the samples. The store operation below interleaves them so
|
||||
* that adjacent pixel component values take on the same sample value,
|
||||
* per above.
|
||||
*/
|
||||
uint8x16x2_t output_pixels = { { samples, samples } };
|
||||
/* Store pixel component values to memory.
|
||||
* Due to the way sample buffers are allocated, we don't need to worry
|
||||
* about tail cases when output_width is not a multiple of 32. See
|
||||
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
||||
*/
|
||||
vst2q_u8(outptr + 2 * colctr, output_pixels);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows an array of samples produced by h2v2 downsampling.
|
||||
*
|
||||
* s0 s1
|
||||
* +---------+---------+
|
||||
* | p0 p1 | p2 p3 |
|
||||
* sA | | |
|
||||
* | p4 p5 | p6 p7 |
|
||||
* +---------+---------+
|
||||
* | p8 p9 | p10 p11|
|
||||
* sB | | |
|
||||
* | p12 p13| p14 p15|
|
||||
* +---------+---------+
|
||||
*
|
||||
* Samples s0A-s1B were created by averaging the original pixel component
|
||||
* values centered at positions p0-p15 above. To approximate those original
|
||||
* pixel component values, we duplicate the samples both horizontally and
|
||||
* vertically:
|
||||
* p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
|
||||
* p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
|
||||
* p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
|
||||
* p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
|
||||
*/
|
||||
|
||||
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr, outptr0, outptr1;
|
||||
int inrow, outrow;
|
||||
unsigned colctr;
|
||||
|
||||
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
|
||||
inptr = input_data[inrow];
|
||||
outptr0 = output_data[outrow++];
|
||||
outptr1 = output_data[outrow++];
|
||||
|
||||
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
||||
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
||||
/* Duplicate the samples. The store operation below interleaves them so
|
||||
* that adjacent pixel component values take on the same sample value,
|
||||
* per above.
|
||||
*/
|
||||
uint8x16x2_t output_pixels = { { samples, samples } };
|
||||
/* Store pixel component values for both output rows to memory.
|
||||
* Due to the way sample buffers are allocated, we don't need to worry
|
||||
* about tail cases when output_width is not a multiple of 32. See
|
||||
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
||||
*/
|
||||
vst2q_u8(outptr0 + 2 * colctr, output_pixels);
|
||||
vst2q_u8(outptr1 + 2 * colctr, output_pixels);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -615,6 +615,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2
|
||||
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
|
||||
EXTERN(void) jsimd_h2v1_upsample_neon
|
||||
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
EXTERN(void) jsimd_h2v2_upsample_neon
|
||||
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
|
||||
/* h2v1 upsampling entry point carrying the _dspr2 suffix; its definition is
 * not shown here.  The argument list is the same as for the other upsample
 * prototypes.
 */
EXTERN(void) jsimd_h2v1_upsample_dspr2
  (int max_v_samp_factor,
   JDIMENSION output_width,
   JSAMPARRAY input_data,
   JSAMPARRAY *output_data_ptr);
|
||||
|
||||
Reference in New Issue
Block a user