Neon: Intrinsics implementation of h2v1 & h2v2 plain upsampling

There was no previous GAS implementation.

NOTE: This doesn't produce much of a speedup when using -O3, because -O3
already enables Neon autovectorization, which works well for the scalar
C implementation of plain upsampling.  However, the Neon SIMD
implementation will benefit other optimization levels.
This commit is contained in:
Jonathan Wright
2018-06-28 16:17:36 +01:00
committed by DRC
parent ba52a3de32
commit 4574f01f43
4 changed files with 159 additions and 0 deletions

View File

@@ -399,12 +399,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
/* Report whether the Neon h2v2 plain upsampling routine may be used.
 *
 * Returns 1 only when samples are 8 bits wide, JDIMENSION occupies 4 bytes,
 * and the JSIMD_NEON bit is set in simd_support; returns 0 otherwise.
 */
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
  /* Called before simd_support is examined (presumably this is what fills
   * it in; init_simd() is defined elsewhere).
   */
  init_simd();

  /* The SIMD code is only written for this sample size and dimension type. */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
/* Report whether the Neon h2v1 plain upsampling routine may be used.
 *
 * Returns 1 only when samples are 8 bits wide, JDIMENSION occupies 4 bytes,
 * and the JSIMD_NEON bit is set in simd_support; returns 0 otherwise.
 */
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
  /* Called before simd_support is examined (presumably this is what fills
   * it in; init_simd() is defined elsewhere).
   */
  init_simd();

  /* The SIMD code is only written for this sample size and dimension type. */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
@@ -412,12 +433,16 @@ GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
/* Forward to the Neon h2v2 upsampler, passing the row count
 * (max_v_samp_factor) and output row width taken from cinfo.  compptr is
 * not used by this wrapper.
 */
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
/* Dispatch h2v1 plain upsampling to the Neon implementation.
 *
 * The row count and output row width are read from cinfo; input_data and
 * output_data_ptr are passed through untouched.  compptr is not used here.
 */
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  const int num_rows = cinfo->max_v_samp_factor;
  const JDIMENSION row_width = cinfo->output_width;

  jsimd_h2v1_upsample_neon(num_rows, row_width, input_data, output_data_ptr);
}
GLOBAL(int)

View File

@@ -467,12 +467,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
/* Report whether the Neon h2v2 plain upsampling routine may be used.
 *
 * Returns 1 only when samples are 8 bits wide, JDIMENSION occupies 4 bytes,
 * and the JSIMD_NEON bit is set in simd_support; returns 0 otherwise.
 */
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
  /* Called before simd_support is examined (presumably this is what fills
   * it in; init_simd() is defined elsewhere).
   */
  init_simd();

  /* The SIMD code is only written for this sample size and dimension type. */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
/* Report whether the Neon h2v1 plain upsampling routine may be used.
 *
 * Returns 1 only when samples are 8 bits wide, JDIMENSION occupies 4 bytes,
 * and the JSIMD_NEON bit is set in simd_support; returns 0 otherwise.
 */
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
  /* Called before simd_support is examined (presumably this is what fills
   * it in; init_simd() is defined elsewhere).
   */
  init_simd();

  /* The SIMD code is only written for this sample size and dimension type. */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_NEON) ? 1 : 0;
}
@@ -480,12 +501,16 @@ GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
/* Forward to the Neon h2v2 upsampler, passing the row count
 * (max_v_samp_factor) and output row width taken from cinfo.  compptr is
 * not used by this wrapper.
 */
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
/* Dispatch h2v1 plain upsampling to the Neon implementation.
 *
 * The row count and output row width are read from cinfo; input_data and
 * output_data_ptr are passed through untouched.  compptr is not used here.
 */
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  const int num_rows = cinfo->max_v_samp_factor;
  const JDIMENSION row_width = cinfo->output_width;

  jsimd_h2v1_upsample_neon(num_rows, row_width, input_data, output_data_ptr);
}
GLOBAL(int)

View File

@@ -465,3 +465,105 @@ void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
}
}
}
/* Plain h2v1 upsampling (horizontal sample replication) using Neon
 * intrinsics.
 *
 * A row of samples produced by h2v1 downsampling looks like this:
 *
 *       s0        s1
 *   +---------+---------+
 *   |         |         |
 *   | p0   p1 | p2   p3 |
 *   |         |         |
 *   +---------+---------+
 *
 * s0 and s1 are averages of the original pixel component values centered at
 * positions p0-p3.  The originals are approximated by repeating each sample
 * twice along the row:
 *   p0(upsampled) = p1(upsampled) = s0
 *   p2(upsampled) = p3(upsampled) = s1
 *
 * max_v_samp_factor  number of rows to process
 * output_width       width of each output row, in samples
 * input_data         rows of downsampled samples to read
 * output_data_ptr    points to the array of output rows to fill
 */
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY dst_rows = *output_data_ptr;
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = dst_rows[row];
    unsigned col;

    /* Each pass consumes 16 input samples and emits 32 output samples. */
    for (col = 0; 2 * col < output_width; col += 16) {
      uint8x16x2_t doubled;

      /* Put the same 16 samples in both lanes of the pair.  The interleaving
       * store then writes s0 s0 s1 s1 ..., giving neighboring output
       * positions the same value as described above.
       */
      doubled.val[0] = vld1q_u8(src + col);
      doubled.val[1] = doubled.val[0];

      /* No special handling is needed when output_width is not a multiple
       * of 32, because of the way sample buffers are allocated.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(dst + 2 * col, doubled);
    }
  }
}
/* Plain h2v2 upsampling (horizontal and vertical sample replication) using
 * Neon intrinsics.
 *
 * An array of samples produced by h2v2 downsampling looks like this:
 *
 *            s0        s1
 *        +---------+---------+
 *        | p0   p1 | p2   p3 |
 *   sA   |         |         |
 *        | p4   p5 | p6   p7 |
 *        +---------+---------+
 *        | p8   p9 | p10  p11|
 *   sB   |         |         |
 *        | p12  p13| p14  p15|
 *        +---------+---------+
 *
 * Samples s0A-s1B are averages of the original pixel component values
 * centered at positions p0-p15.  The originals are approximated by repeating
 * each sample twice along the row and twice down the column:
 *   p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
 *   p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
 *   p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
 *   p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
 *
 * max_v_samp_factor  number of output rows to produce (two per input row)
 * output_width       width of each output row, in samples
 * input_data         rows of downsampled samples to read
 * output_data_ptr    points to the array of output rows to fill
 */
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY dst_rows = *output_data_ptr;
  int src_row = 0;
  int dst_row = 0;

  while (dst_row < max_v_samp_factor) {
    JSAMPROW src = input_data[src_row];
    /* Every input row feeds a pair of consecutive output rows. */
    JSAMPROW dst_upper = dst_rows[dst_row];
    JSAMPROW dst_lower = dst_rows[dst_row + 1];
    unsigned col;

    /* Each pass consumes 16 input samples and emits 32 output samples to
     * each of the two output rows.
     */
    for (col = 0; 2 * col < output_width; col += 16) {
      uint8x16x2_t doubled;

      /* Put the same 16 samples in both lanes of the pair.  The interleaving
       * store then writes s0 s0 s1 s1 ..., giving neighboring output
       * positions the same value as described above.
       */
      doubled.val[0] = vld1q_u8(src + col);
      doubled.val[1] = doubled.val[0];

      /* No special handling is needed when output_width is not a multiple
       * of 32, because of the way sample buffers are allocated.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(dst_upper + 2 * col, doubled);
      vst2q_u8(dst_lower + 2 * col, doubled);
    }

    src_row++;
    dst_row += 2;
  }
}

View File

@@ -615,6 +615,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
/* Neon intrinsics implementations of plain h2v1 and h2v2 upsampling.  Both
 * read rows from input_data and write rows through *output_data_ptr.
 */
EXTERN(void) jsimd_h2v1_upsample_neon
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_neon
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_upsample_dspr2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);