Neon: Intrinsics impl. of h1v2 fancy upsamling

There was no previous GAS implementation.
This commit is contained in:
Jonathan Wright
2019-05-08 15:43:26 +01:00
committed by DRC
parent 975307775c
commit 2acfb93c94
8 changed files with 188 additions and 2 deletions

View File

@@ -720,6 +720,8 @@ if(WITH_12BIT)
set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0) set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4) set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950) set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
set(MD5_JPEG_440_ISLOW e25c1912e38367be505a89c410c1c2d2)
set(MD5_PPM_440_ISLOW e7d2e26288870cfcb30f3114ad01e380)
set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0) set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8) set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e) set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
@@ -768,6 +770,8 @@ else()
set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4) set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8) set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa) set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
set(MD5_JPEG_440_ISLOW 538bc02bd4b4658fd85de6ece6cbeda6)
set(MD5_PPM_440_ISLOW 11e7eab7ef7ef3276934bb7e7b6bb377)
set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d) set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065) set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1) set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
@@ -1101,6 +1105,16 @@ foreach(libtype ${TEST_LIBTYPES})
testout_422_ifast.ppm testout_422_ifast_opt.jpg testout_422_ifast.ppm testout_422_ifast_opt.jpg
${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt) ${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
# CC: RGB->YCC SAMP: fullsize/h1v2 FDCT: islow ENT: huff
add_bittest(cjpeg 440-islow "-sample;1x2;-dct;int"
testout_440_islow.jpg ${TESTIMAGES}/testorig.ppm
${MD5_JPEG_440_ISLOW})
# CC: YCC->RGB SAMP: fullsize/h1v2 fancy IDCT: islow ENT: huff
add_bittest(djpeg 440-islow "-dct;int"
testout_440_islow.ppm testout_440_islow.jpg
${MD5_PPM_440_ISLOW} cjpeg-${libtype}-440-islow)
# CC: YCC->RGB SAMP: h2v1 merged IDCT: ifast ENT: huff # CC: YCC->RGB SAMP: h2v1 merged IDCT: ifast ENT: huff
add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth" add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
testout_422m_ifast.ppm testout_422_ifast_opt.jpg testout_422m_ifast.ppm testout_422_ifast_opt.jpg

View File

@@ -8,7 +8,7 @@
* Copyright (C) 2010, 2015-2016, D. R. Commander. * Copyright (C) 2010, 2015-2016, D. R. Commander.
* Copyright (C) 2014, MIPS Technologies, Inc., California. * Copyright (C) 2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Google, Inc.
* Copyright (C) 2019, Arm Limited. * Copyright (C) 2019-2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg * For conditions of distribution and use, see the accompanying README.ijg
* file. * file.
* *
@@ -477,6 +477,11 @@ jinit_upsampler(j_decompress_ptr cinfo)
} else if (h_in_group == h_out_group && } else if (h_in_group == h_out_group &&
v_in_group * 2 == v_out_group && do_fancy) { v_in_group * 2 == v_out_group && do_fancy) {
/* Non-fancy upsampling is handled by the generic method */ /* Non-fancy upsampling is handled by the generic method */
#if defined(__arm__) || defined(__aarch64__)
if (jsimd_can_h1v2_fancy_upsample())
upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
else
#endif
upsample->methods[ci] = h1v2_fancy_upsample; upsample->methods[ci] = h1v2_fancy_upsample;
upsample->pub.need_context_rows = TRUE; upsample->pub.need_context_rows = TRUE;
} else if (h_in_group * 2 == h_out_group && } else if (h_in_group * 2 == h_out_group &&

View File

@@ -4,6 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, 2014, D. R. Commander. * Copyright (C) 2011, 2014, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois. * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
* Copyright (C) 2020, Arm Limited.
* *
* Based on the x86 SIMD extension for IJG JPEG library, * Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru. * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -75,6 +76,7 @@ EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
EXTERN(int) jsimd_can_h2v2_fancy_upsample(void); EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
EXTERN(int) jsimd_can_h2v1_fancy_upsample(void); EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
jpeg_component_info *compptr, jpeg_component_info *compptr,
@@ -84,6 +86,10 @@ EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
jpeg_component_info *compptr, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr); JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
jpeg_component_info *compptr,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(int) jsimd_can_h2v2_merged_upsample(void); EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
EXTERN(int) jsimd_can_h2v1_merged_upsample(void); EXTERN(int) jsimd_can_h2v1_merged_upsample(void);

View File

@@ -4,6 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, D. R. Commander. * Copyright (C) 2009-2011, 2014, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois. * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
* Copyright (C) 2020, Arm Limited.
* *
* Based on the x86 SIMD extension for IJG JPEG library, * Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru. * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -169,6 +170,12 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0; return 0;
} }
GLOBAL(int)
jsimd_can_h1v2_fancy_upsample(void)
{
return 0;
}
GLOBAL(void) GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -181,6 +188,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
{ {
} }
GLOBAL(void)
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(int) GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void) jsimd_can_h2v2_merged_upsample(void)
{ {

View File

@@ -454,6 +454,23 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0; return 0;
} }
GLOBAL(int)
jsimd_can_h1v2_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void) GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -472,6 +489,15 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr); output_data_ptr);
} }
GLOBAL(void)
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(int) GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void) jsimd_can_h2v2_merged_upsample(void)
{ {

View File

@@ -522,6 +522,23 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0; return 0;
} }
GLOBAL(int)
jsimd_can_h1v2_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void) GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -540,6 +557,15 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr); output_data_ptr);
} }
GLOBAL(void)
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(int) GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void) jsimd_can_h2v2_merged_upsample(void)
{ {

View File

@@ -372,3 +372,96 @@ void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
inrow++; inrow++;
} }
} }
/* The diagram below shows a column of samples produced by h1v2 downsampling
* (or by losslessly rotating or transposing an h2v1-downsampled image.)
*
* +---------+
* | p0 |
* sA | |
* | p1 |
* +---------+
* | p2 |
* sB | |
* | p3 |
* +---------+
* | p4 |
* sC | |
* | p5 |
* +---------+
*
* Samples sA-sC were created by averaging the original pixel component values
* centered at positions p0-p5 above. To approximate those original pixel
* component values, we proportionally blend the adjacent samples in each
* column.
*
* An upsampled pixel component value is computed by blending the sample
* containing the pixel center with the nearest neighboring sample, in the
* ratio 3:1. For example:
* p1(upsampled) = 3/4 * sA + 1/4 * sB
* p2(upsampled) = 3/4 * sB + 1/4 * sA
* When computing the first and last pixel component values in the column,
* there is no adjacent sample to blend, so:
* p0(upsampled) = sA
* p5(upsampled) = sC
*/
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t one_u16 = vdupq_n_u16(1);
const uint8x8_t three_u8 = vdup_n_u8(3);
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
inptr0 = input_data[inrow - 1];
inptr1 = input_data[inrow];
inptr2 = input_data[inrow + 1];
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
* respectively.
*/
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
inrow++;
/* The size of the input and output buffers is always a multiple of 32
* bytes => no need to worry about buffer overflow when reading/writing
* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
* details.
*/
for (colctr = 0; colctr < downsampled_width; colctr += 16) {
/* Load samples. */
uint8x16_t sA = vld1q_u8(inptr0 + colctr);
uint8x16_t sB = vld1q_u8(inptr1 + colctr);
uint8x16_t sC = vld1q_u8(inptr2 + colctr);
/* Blend samples vertically. */
uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
vget_low_u8(sB), three_u8);
uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
vget_high_u8(sB), three_u8);
uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
vget_low_u8(sB), three_u8);
uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
vget_high_u8(sB), three_u8);
/* Add ordered dithering bias to pixel values in even output rows. */
colsum0_l = vaddq_u16(colsum0_l, one_u16);
colsum0_h = vaddq_u16(colsum0_h, one_u16);
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
vshrn_n_u16(colsum0_h, 2));
uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
vrshrn_n_u16(colsum1_h, 2));
/* Store pixel component values to memory. */
vst1q_u8(outptr0 + colctr, output_pixels0);
vst1q_u8(outptr1 + colctr, output_pixels1);
}
}
}

View File

@@ -664,6 +664,9 @@ EXTERN(void) jsimd_h2v1_fancy_upsample_neon
EXTERN(void) jsimd_h2v2_fancy_upsample_neon EXTERN(void) jsimd_h2v2_fancy_upsample_neon
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr); JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h1v2_fancy_upsample_neon
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2 EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,