Neon: Intrinsics implementation of h2v1 and h2v2 plain upsampling
There was no previous GAS (GNU Assembler) implementation of these routines. NOTE: This doesn't produce much of a speedup when using -O3, because -O3 already enables Neon autovectorization, which works well for the scalar C implementation of plain upsampling. However, the Neon SIMD implementation will still benefit builds that use other optimization levels.
This commit is contained in:
@@ -399,12 +399,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v2_upsample(void)
|
jsimd_can_h2v2_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v1_upsample(void)
|
jsimd_can_h2v1_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -412,12 +433,16 @@ GLOBAL(void)
|
|||||||
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||||
{
|
{
|
||||||
|
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||||
|
input_data, output_data_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||||
{
|
{
|
||||||
|
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||||
|
input_data, output_data_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
|
|||||||
@@ -467,12 +467,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v2_upsample(void)
|
jsimd_can_h2v2_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v1_upsample(void)
|
jsimd_can_h2v1_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -480,12 +501,16 @@ GLOBAL(void)
|
|||||||
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||||
{
|
{
|
||||||
|
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||||
|
input_data, output_data_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||||
{
|
{
|
||||||
|
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||||
|
input_data, output_data_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
|
|||||||
@@ -465,3 +465,105 @@ void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* The diagram below shows a row of samples produced by h2v1 downsampling.
|
||||||
|
*
|
||||||
|
* s0 s1
|
||||||
|
* +---------+---------+
|
||||||
|
* | | |
|
||||||
|
* | p0 p1 | p2 p3 |
|
||||||
|
* | | |
|
||||||
|
* +---------+---------+
|
||||||
|
*
|
||||||
|
* Samples s0 and s1 were created by averaging the original pixel component
|
||||||
|
* values centered at positions p0-p3 above. To approximate those original
|
||||||
|
* pixel component values, we duplicate the samples horizontally:
|
||||||
|
* p0(upsampled) = p1(upsampled) = s0
|
||||||
|
* p2(upsampled) = p3(upsampled) = s1
|
||||||
|
*/
|
||||||
|
|
||||||
|
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
||||||
|
JSAMPARRAY input_data,
|
||||||
|
JSAMPARRAY *output_data_ptr)
|
||||||
|
{
|
||||||
|
JSAMPARRAY output_data = *output_data_ptr;
|
||||||
|
JSAMPROW inptr, outptr;
|
||||||
|
int inrow;
|
||||||
|
unsigned colctr;
|
||||||
|
|
||||||
|
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
|
||||||
|
inptr = input_data[inrow];
|
||||||
|
outptr = output_data[inrow];
|
||||||
|
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
||||||
|
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
||||||
|
/* Duplicate the samples. The store operation below interleaves them so
|
||||||
|
* that adjacent pixel component values take on the same sample value,
|
||||||
|
* per above.
|
||||||
|
*/
|
||||||
|
uint8x16x2_t output_pixels = { { samples, samples } };
|
||||||
|
/* Store pixel component values to memory.
|
||||||
|
* Due to the way sample buffers are allocated, we don't need to worry
|
||||||
|
* about tail cases when output_width is not a multiple of 32. See
|
||||||
|
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
||||||
|
*/
|
||||||
|
vst2q_u8(outptr + 2 * colctr, output_pixels);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* The diagram below shows an array of samples produced by h2v2 downsampling.
|
||||||
|
*
|
||||||
|
* s0 s1
|
||||||
|
* +---------+---------+
|
||||||
|
* | p0 p1 | p2 p3 |
|
||||||
|
* sA | | |
|
||||||
|
* | p4 p5 | p6 p7 |
|
||||||
|
* +---------+---------+
|
||||||
|
* | p8 p9 | p10 p11|
|
||||||
|
* sB | | |
|
||||||
|
* | p12 p13| p14 p15|
|
||||||
|
* +---------+---------+
|
||||||
|
*
|
||||||
|
* Samples s0A-s1B were created by averaging the original pixel component
|
||||||
|
* values centered at positions p0-p15 above. To approximate those original
|
||||||
|
* pixel component values, we duplicate the samples both horizontally and
|
||||||
|
* vertically:
|
||||||
|
* p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
|
||||||
|
* p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
|
||||||
|
* p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
|
||||||
|
* p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
|
||||||
|
*/
|
||||||
|
|
||||||
|
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
||||||
|
JSAMPARRAY input_data,
|
||||||
|
JSAMPARRAY *output_data_ptr)
|
||||||
|
{
|
||||||
|
JSAMPARRAY output_data = *output_data_ptr;
|
||||||
|
JSAMPROW inptr, outptr0, outptr1;
|
||||||
|
int inrow, outrow;
|
||||||
|
unsigned colctr;
|
||||||
|
|
||||||
|
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
|
||||||
|
inptr = input_data[inrow];
|
||||||
|
outptr0 = output_data[outrow++];
|
||||||
|
outptr1 = output_data[outrow++];
|
||||||
|
|
||||||
|
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
||||||
|
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
||||||
|
/* Duplicate the samples. The store operation below interleaves them so
|
||||||
|
* that adjacent pixel component values take on the same sample value,
|
||||||
|
* per above.
|
||||||
|
*/
|
||||||
|
uint8x16x2_t output_pixels = { { samples, samples } };
|
||||||
|
/* Store pixel component values for both output rows to memory.
|
||||||
|
* Due to the way sample buffers are allocated, we don't need to worry
|
||||||
|
* about tail cases when output_width is not a multiple of 32. See
|
||||||
|
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
||||||
|
*/
|
||||||
|
vst2q_u8(outptr0 + 2 * colctr, output_pixels);
|
||||||
|
vst2q_u8(outptr1 + 2 * colctr, output_pixels);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -615,6 +615,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2
|
|||||||
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||||
JSAMPARRAY *output_data_ptr);
|
JSAMPARRAY *output_data_ptr);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_h2v1_upsample_neon
|
||||||
|
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||||
|
JSAMPARRAY *output_data_ptr);
|
||||||
|
EXTERN(void) jsimd_h2v2_upsample_neon
|
||||||
|
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||||
|
JSAMPARRAY *output_data_ptr);
|
||||||
|
|
||||||
EXTERN(void) jsimd_h2v1_upsample_dspr2
|
EXTERN(void) jsimd_h2v1_upsample_dspr2
|
||||||
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
|
||||||
JSAMPARRAY *output_data_ptr);
|
JSAMPARRAY *output_data_ptr);
|
||||||
|
|||||||
Reference in New Issue
Block a user