Neon: Intrinsics impl. of h1v2 fancy upsampling
There was no previous GAS implementation.
This commit is contained in:
@@ -720,6 +720,8 @@ if(WITH_12BIT)
|
||||
set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
|
||||
set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
|
||||
set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
|
||||
set(MD5_JPEG_440_ISLOW e25c1912e38367be505a89c410c1c2d2)
|
||||
set(MD5_PPM_440_ISLOW e7d2e26288870cfcb30f3114ad01e380)
|
||||
set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
|
||||
set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
|
||||
set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
|
||||
@@ -768,6 +770,8 @@ else()
|
||||
set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
|
||||
set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
|
||||
set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
|
||||
set(MD5_JPEG_440_ISLOW 538bc02bd4b4658fd85de6ece6cbeda6)
|
||||
set(MD5_PPM_440_ISLOW 11e7eab7ef7ef3276934bb7e7b6bb377)
|
||||
set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
|
||||
set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
|
||||
set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
|
||||
@@ -1101,6 +1105,16 @@ foreach(libtype ${TEST_LIBTYPES})
|
||||
testout_422_ifast.ppm testout_422_ifast_opt.jpg
|
||||
${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
|
||||
|
||||
# CC: RGB->YCC SAMP: fullsize/h1v2 FDCT: islow ENT: huff
|
||||
add_bittest(cjpeg 440-islow "-sample;1x2;-dct;int"
|
||||
testout_440_islow.jpg ${TESTIMAGES}/testorig.ppm
|
||||
${MD5_JPEG_440_ISLOW})
|
||||
|
||||
# CC: YCC->RGB SAMP: fullsize/h1v2 fancy IDCT: islow ENT: huff
|
||||
add_bittest(djpeg 440-islow "-dct;int"
|
||||
testout_440_islow.ppm testout_440_islow.jpg
|
||||
${MD5_PPM_440_ISLOW} cjpeg-${libtype}-440-islow)
|
||||
|
||||
# CC: YCC->RGB SAMP: h2v1 merged IDCT: ifast ENT: huff
|
||||
add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
|
||||
testout_422m_ifast.ppm testout_422_ifast_opt.jpg
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* Copyright (C) 2010, 2015-2016, D. R. Commander.
|
||||
* Copyright (C) 2014, MIPS Technologies, Inc., California.
|
||||
* Copyright (C) 2015, Google, Inc.
|
||||
* Copyright (C) 2019, Arm Limited.
|
||||
* Copyright (C) 2019-2020, Arm Limited.
|
||||
* For conditions of distribution and use, see the accompanying README.ijg
|
||||
* file.
|
||||
*
|
||||
@@ -477,6 +477,11 @@ jinit_upsampler(j_decompress_ptr cinfo)
|
||||
} else if (h_in_group == h_out_group &&
|
||||
v_in_group * 2 == v_out_group && do_fancy) {
|
||||
/* Non-fancy upsampling is handled by the generic method */
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
if (jsimd_can_h1v2_fancy_upsample())
|
||||
upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
|
||||
else
|
||||
#endif
|
||||
upsample->methods[ci] = h1v2_fancy_upsample;
|
||||
upsample->pub.need_context_rows = TRUE;
|
||||
} else if (h_in_group * 2 == h_out_group &&
|
||||
|
||||
6
jsimd.h
6
jsimd.h
@@ -4,6 +4,7 @@
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2011, 2014, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
* Copyright (C) 2020, Arm Limited.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -75,6 +76,7 @@ EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
|
||||
|
||||
EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
|
||||
EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
|
||||
EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
|
||||
|
||||
EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
|
||||
jpeg_component_info *compptr,
|
||||
@@ -84,6 +86,10 @@ EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
|
||||
jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
|
||||
jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
|
||||
EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
|
||||
EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
|
||||
|
||||
13
jsimd_none.c
13
jsimd_none.c
@@ -4,6 +4,7 @@
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
* Copyright (C) 2020, Arm Limited.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -169,6 +170,12 @@ jsimd_can_h2v1_fancy_upsample(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h1v2_fancy_upsample(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
@@ -181,6 +188,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_merged_upsample(void)
|
||||
{
|
||||
|
||||
@@ -454,6 +454,23 @@ jsimd_can_h2v1_fancy_upsample(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h1v2_fancy_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
@@ -472,6 +489,15 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_merged_upsample(void)
|
||||
{
|
||||
|
||||
@@ -522,6 +522,23 @@ jsimd_can_h2v1_fancy_upsample(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h1v2_fancy_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
@@ -540,6 +557,15 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_merged_upsample(void)
|
||||
{
|
||||
|
||||
@@ -372,3 +372,96 @@ void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
|
||||
inrow++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows a column of samples produced by h1v2 downsampling
|
||||
* (or by losslessly rotating or transposing an h2v1-downsampled image.)
|
||||
*
|
||||
* +---------+
|
||||
* | p0 |
|
||||
* sA | |
|
||||
* | p1 |
|
||||
* +---------+
|
||||
* | p2 |
|
||||
* sB | |
|
||||
* | p3 |
|
||||
* +---------+
|
||||
* | p4 |
|
||||
* sC | |
|
||||
* | p5 |
|
||||
* +---------+
|
||||
*
|
||||
* Samples sA-sC were created by averaging the original pixel component values
|
||||
* centered at positions p0-p5 above. To approximate those original pixel
|
||||
* component values, we proportionally blend the adjacent samples in each
|
||||
* column.
|
||||
*
|
||||
* An upsampled pixel component value is computed by blending the sample
|
||||
* containing the pixel center with the nearest neighboring sample, in the
|
||||
* ratio 3:1. For example:
|
||||
* p1(upsampled) = 3/4 * sA + 1/4 * sB
|
||||
* p2(upsampled) = 3/4 * sB + 1/4 * sA
|
||||
* When computing the first and last pixel component values in the column,
|
||||
* there is no adjacent sample to blend, so:
|
||||
* p0(upsampled) = sA
|
||||
* p5(upsampled) = sC
|
||||
*/
|
||||
|
||||
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
|
||||
JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
|
||||
int inrow, outrow;
|
||||
unsigned colctr;
|
||||
/* Set up constants. */
|
||||
const uint16x8_t one_u16 = vdupq_n_u16(1);
|
||||
const uint8x8_t three_u8 = vdup_n_u8(3);
|
||||
|
||||
inrow = outrow = 0;
|
||||
while (outrow < max_v_samp_factor) {
|
||||
inptr0 = input_data[inrow - 1];
|
||||
inptr1 = input_data[inrow];
|
||||
inptr2 = input_data[inrow + 1];
|
||||
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
|
||||
* respectively.
|
||||
*/
|
||||
outptr0 = output_data[outrow++];
|
||||
outptr1 = output_data[outrow++];
|
||||
inrow++;
|
||||
|
||||
/* The size of the input and output buffers is always a multiple of 32
|
||||
* bytes => no need to worry about buffer overflow when reading/writing
|
||||
* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
|
||||
* details.
|
||||
*/
|
||||
for (colctr = 0; colctr < downsampled_width; colctr += 16) {
|
||||
/* Load samples. */
|
||||
uint8x16_t sA = vld1q_u8(inptr0 + colctr);
|
||||
uint8x16_t sB = vld1q_u8(inptr1 + colctr);
|
||||
uint8x16_t sC = vld1q_u8(inptr2 + colctr);
|
||||
/* Blend samples vertically. */
|
||||
uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
|
||||
vget_low_u8(sB), three_u8);
|
||||
uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
|
||||
vget_high_u8(sB), three_u8);
|
||||
uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
|
||||
vget_low_u8(sB), three_u8);
|
||||
uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
|
||||
vget_high_u8(sB), three_u8);
|
||||
/* Add ordered dithering bias to pixel values in even output rows. */
|
||||
colsum0_l = vaddq_u16(colsum0_l, one_u16);
|
||||
colsum0_h = vaddq_u16(colsum0_h, one_u16);
|
||||
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
||||
uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
|
||||
vshrn_n_u16(colsum0_h, 2));
|
||||
uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
|
||||
vrshrn_n_u16(colsum1_h, 2));
|
||||
/* Store pixel component values to memory. */
|
||||
vst1q_u8(outptr0 + colctr, output_pixels0);
|
||||
vst1q_u8(outptr1 + colctr, output_pixels1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -664,6 +664,9 @@ EXTERN(void) jsimd_h2v1_fancy_upsample_neon
|
||||
EXTERN(void) jsimd_h2v2_fancy_upsample_neon
|
||||
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
EXTERN(void) jsimd_h1v2_fancy_upsample_neon
|
||||
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr);
|
||||
|
||||
EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
|
||||
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
|
||||
|
||||
Reference in New Issue
Block a user