diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c index df13c119..fac55dfb 100644 --- a/simd/arm/aarch32/jsimd.c +++ b/simd/arm/aarch32/jsimd.c @@ -399,12 +399,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, GLOBAL(int) jsimd_can_h2v2_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -412,12 +433,16 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(int) diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c index 74cd6cca..8570b82c 100644 --- a/simd/arm/aarch64/jsimd.c +++ b/simd/arm/aarch64/jsimd.c @@ -467,12 +467,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, GLOBAL(int) jsimd_can_h2v2_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -480,12 +501,16 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(int) diff --git a/simd/arm/jdsample-neon.c b/simd/arm/jdsample-neon.c index b31d6cf4..90ec6782 100644 --- a/simd/arm/jdsample-neon.c +++ b/simd/arm/jdsample-neon.c @@ -465,3 +465,105 @@ void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor, } } } + + +/* The diagram below shows a row of samples produced by h2v1 downsampling. + * + * s0 s1 + * +---------+---------+ + * | | | + * | p0 p1 | p2 p3 | + * | | | + * +---------+---------+ + * + * Samples s0 and s1 were created by averaging the original pixel component + * values centered at positions p0-p3 above. To approximate those original + * pixel component values, we duplicate the samples horizontally: + * p0(upsampled) = p1(upsampled) = s0 + * p2(upsampled) = p3(upsampled) = s1 + */ + +void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow; + unsigned colctr; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + for (colctr = 0; 2 * colctr < output_width; colctr += 16) { + uint8x16_t samples = vld1q_u8(inptr + colctr); + /* Duplicate the samples. The store operation below interleaves them so + * that adjacent pixel component values take on the same sample value, + * per above. + */ + uint8x16x2_t output_pixels = { { samples, samples } }; + /* Store pixel component values to memory. + * Due to the way sample buffers are allocated, we don't need to worry + * about tail cases when output_width is not a multiple of 32. See + * "Creation of 2-D sample arrays" in jmemmgr.c for details. + */ + vst2q_u8(outptr + 2 * colctr, output_pixels); + } + } +} + + +/* The diagram below shows an array of samples produced by h2v2 downsampling. + * + * s0 s1 + * +---------+---------+ + * | p0 p1 | p2 p3 | + * sA | | | + * | p4 p5 | p6 p7 | + * +---------+---------+ + * | p8 p9 | p10 p11| + * sB | | | + * | p12 p13| p14 p15| + * +---------+---------+ + * + * Samples s0A-s1B were created by averaging the original pixel component + * values centered at positions p0-p15 above. To approximate those original + * pixel component values, we duplicate the samples both horizontally and + * vertically: + * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A + * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A + * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B + * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B + */ + +void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow; + unsigned colctr; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (colctr = 0; 2 * colctr < output_width; colctr += 16) { + uint8x16_t samples = vld1q_u8(inptr + colctr); + /* Duplicate the samples. The store operation below interleaves them so + * that adjacent pixel component values take on the same sample value, + * per above. + */ + uint8x16x2_t output_pixels = { { samples, samples } }; + /* Store pixel component values for both output rows to memory. + * Due to the way sample buffers are allocated, we don't need to worry + * about tail cases when output_width is not a multiple of 32. See + * "Creation of 2-D sample arrays" in jmemmgr.c for details. + */ + vst2q_u8(outptr0 + 2 * colctr, output_pixels); + vst2q_u8(outptr1 + 2 * colctr, output_pixels); + } + } +} diff --git a/simd/jsimd.h b/simd/jsimd.h index ccc9a104..64747c63 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -615,6 +615,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2 (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v1_upsample_neon + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_neon + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + EXTERN(void) jsimd_h2v1_upsample_dspr2 (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);