Neon: Intrinsics impl of h2v1 & h2v2 merged upsamp
There was no previous GAS implementation. This commit also reverts40557b2301and7723d7f7d0.7723d7f7d0was only necessary because there was no Neon implementation of merged upsampling/color conversion, and40557b2301was only necessary because of7723d7f7d0.
This commit is contained in:
@@ -800,29 +800,7 @@ else()
|
|||||||
set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
|
set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
|
||||||
set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
|
set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
|
||||||
set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
|
set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
|
||||||
# Since v1.5.1, libjpeg-turbo uses the separate non-fancy upsampling and
|
|
||||||
# YCbCr -> RGB color conversion routines rather than merged upsampling/color
|
|
||||||
# conversion when fancy upsampling is disabled on platforms that have a SIMD
|
|
||||||
# implementation of YCbCr -> RGB color conversion but no SIMD implementation
|
|
||||||
# of merged upsampling/color conversion. This was intended to improve the
|
|
||||||
# performance of the Arm Neon SIMD extensions, the only SIMD extensions for
|
|
||||||
# which those circumstances currently apply. The separate non-fancy
|
|
||||||
# upsampling and color conversion routines usually produce bitwise-identical
|
|
||||||
# output to the merged upsampling/color conversion routines, but that is not
|
|
||||||
# the case when skipping scanlines starting at an odd-numbered scanline. In
|
|
||||||
# libjpeg-turbo 2.0.5 and prior, doing that while using merged h2v2
|
|
||||||
# upsampling caused a segfault, so this test validates the fix for that
|
|
||||||
# segfault. Unfortunately, however, the test also produces different bitwise
|
|
||||||
# output when using the Neon SIMD extensions, because of the aforementioned
|
|
||||||
# optimization. The easiest workaround is to use the old test from
|
|
||||||
# libjpeg-turbo 2.0.5 and prior when using the Neon SIMD extensions. The
|
|
||||||
# aforementioned segfault never would have occurred with the Neon SIMD
|
|
||||||
# extensions anyhow, since merged upsampling is disabled when using them.
|
|
||||||
if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
|
|
||||||
set(MD5_PPM_420M_IFAST_ARI 72b59a99bcf1de24c5b27d151bde2437)
|
|
||||||
else()
|
|
||||||
set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
|
set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
|
||||||
endif()
|
|
||||||
set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
|
set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
|
||||||
set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
|
set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
|
||||||
set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
|
set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
|
||||||
@@ -1223,17 +1201,9 @@ foreach(libtype ${TEST_LIBTYPES})
|
|||||||
|
|
||||||
if(WITH_ARITH_DEC)
|
if(WITH_ARITH_DEC)
|
||||||
# CC: RGB->YCC SAMP: h2v2 merged IDCT: ifast ENT: arith
|
# CC: RGB->YCC SAMP: h2v2 merged IDCT: ifast ENT: arith
|
||||||
if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
|
|
||||||
# Refer to the comment above the definition of MD5_PPM_420M_IFAST_ARI for
|
|
||||||
# an explanation of why this is necessary.
|
|
||||||
add_bittest(djpeg 420m-ifast-ari "-fast;-ppm"
|
|
||||||
testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
|
|
||||||
${MD5_PPM_420M_IFAST_ARI})
|
|
||||||
else()
|
|
||||||
add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
|
add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
|
||||||
testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
|
testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
|
||||||
${MD5_PPM_420M_IFAST_ARI})
|
${MD5_PPM_420M_IFAST_ARI})
|
||||||
endif()
|
|
||||||
|
|
||||||
add_bittest(jpegtran 420-islow ""
|
add_bittest(jpegtran 420-islow ""
|
||||||
testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
|
testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
|
||||||
|
|||||||
12
jdmaster.c
12
jdmaster.c
@@ -22,7 +22,6 @@
|
|||||||
#include "jpeglib.h"
|
#include "jpeglib.h"
|
||||||
#include "jpegcomp.h"
|
#include "jpegcomp.h"
|
||||||
#include "jdmaster.h"
|
#include "jdmaster.h"
|
||||||
#include "jsimd.h"
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -70,17 +69,6 @@ use_merged_upsample(j_decompress_ptr cinfo)
|
|||||||
cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
|
cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
|
||||||
cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
|
cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
#ifdef WITH_SIMD
|
|
||||||
/* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
|
|
||||||
isn't, then disabling merged upsampling is likely to be faster when
|
|
||||||
decompressing YCbCr JPEG images. */
|
|
||||||
if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
|
|
||||||
jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
|
|
||||||
(cinfo->out_color_space == JCS_RGB ||
|
|
||||||
(cinfo->out_color_space >= JCS_EXT_RGB &&
|
|
||||||
cinfo->out_color_space <= JCS_EXT_ARGB)))
|
|
||||||
return FALSE;
|
|
||||||
#endif
|
|
||||||
/* ??? also need to test for upsample-time rescaling, when & if supported */
|
/* ??? also need to test for upsample-time rescaling, when & if supported */
|
||||||
return TRUE; /* by golly, it'll work... */
|
return TRUE; /* by golly, it'll work... */
|
||||||
#else
|
#else
|
||||||
|
|||||||
@@ -266,7 +266,8 @@ endif()
|
|||||||
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
|
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
|
||||||
|
|
||||||
set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
|
set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
|
||||||
arm/jdsample-neon.c arm/jfdctfst-neon.c arm/jquanti-neon.c)
|
arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
|
||||||
|
arm/jquanti-neon.c)
|
||||||
if(NEON_INTRINSICS)
|
if(NEON_INTRINSICS)
|
||||||
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
|
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -501,12 +501,34 @@ jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v2_merged_upsample(void)
|
jsimd_can_h2v2_merged_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v1_merged_upsample(void)
|
jsimd_can_h2v1_merged_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -514,12 +536,74 @@ GLOBAL(void)
|
|||||||
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
||||||
{
|
{
|
||||||
|
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||||
|
|
||||||
|
switch (cinfo->out_color_space) {
|
||||||
|
case JCS_EXT_RGB:
|
||||||
|
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_RGBX:
|
||||||
|
case JCS_EXT_RGBA:
|
||||||
|
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGR:
|
||||||
|
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGRX:
|
||||||
|
case JCS_EXT_BGRA:
|
||||||
|
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XBGR:
|
||||||
|
case JCS_EXT_ABGR:
|
||||||
|
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XRGB:
|
||||||
|
case JCS_EXT_ARGB:
|
||||||
|
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
||||||
{
|
{
|
||||||
|
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||||
|
|
||||||
|
switch (cinfo->out_color_space) {
|
||||||
|
case JCS_EXT_RGB:
|
||||||
|
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_RGBX:
|
||||||
|
case JCS_EXT_RGBA:
|
||||||
|
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGR:
|
||||||
|
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGRX:
|
||||||
|
case JCS_EXT_BGRA:
|
||||||
|
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XBGR:
|
||||||
|
case JCS_EXT_ABGR:
|
||||||
|
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XRGB:
|
||||||
|
case JCS_EXT_ARGB:
|
||||||
|
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
|
|||||||
@@ -569,12 +569,34 @@ jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v2_merged_upsample(void)
|
jsimd_can_h2v2_merged_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_h2v1_merged_upsample(void)
|
jsimd_can_h2v1_merged_upsample(void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_NEON)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -582,12 +604,74 @@ GLOBAL(void)
|
|||||||
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
||||||
{
|
{
|
||||||
|
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||||
|
|
||||||
|
switch (cinfo->out_color_space) {
|
||||||
|
case JCS_EXT_RGB:
|
||||||
|
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_RGBX:
|
||||||
|
case JCS_EXT_RGBA:
|
||||||
|
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGR:
|
||||||
|
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGRX:
|
||||||
|
case JCS_EXT_BGRA:
|
||||||
|
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XBGR:
|
||||||
|
case JCS_EXT_ABGR:
|
||||||
|
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XRGB:
|
||||||
|
case JCS_EXT_ARGB:
|
||||||
|
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
||||||
{
|
{
|
||||||
|
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||||
|
|
||||||
|
switch (cinfo->out_color_space) {
|
||||||
|
case JCS_EXT_RGB:
|
||||||
|
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_RGBX:
|
||||||
|
case JCS_EXT_RGBA:
|
||||||
|
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGR:
|
||||||
|
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGRX:
|
||||||
|
case JCS_EXT_BGRA:
|
||||||
|
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XBGR:
|
||||||
|
case JCS_EXT_ABGR:
|
||||||
|
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XRGB:
|
||||||
|
case JCS_EXT_ARGB:
|
||||||
|
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
|
|||||||
144
simd/arm/jdmerge-neon.c
Normal file
144
simd/arm/jdmerge-neon.c
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
/*
|
||||||
|
* jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define JPEG_INTERNALS
|
||||||
|
#include "../../jinclude.h"
|
||||||
|
#include "../../jpeglib.h"
|
||||||
|
#include "../../jsimd.h"
|
||||||
|
#include "../../jdct.h"
|
||||||
|
#include "../../jsimddct.h"
|
||||||
|
#include "../jsimd.h"
|
||||||
|
#include "align.h"
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
|
||||||
|
/* YCbCr -> RGB conversion constants */
|
||||||
|
|
||||||
|
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
|
||||||
|
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
|
||||||
|
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
|
||||||
|
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
|
||||||
|
|
||||||
|
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
|
||||||
|
-F_0_344, F_0_714, F_1_402, F_1_772
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Include inline routines for colorspace extensions. */
|
||||||
|
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
|
||||||
|
#define RGB_RED EXT_RGB_RED
|
||||||
|
#define RGB_GREEN EXT_RGB_GREEN
|
||||||
|
#define RGB_BLUE EXT_RGB_BLUE
|
||||||
|
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||||
|
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
|
||||||
|
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef jsimd_h2v1_merged_upsample_neon
|
||||||
|
#undef jsimd_h2v2_merged_upsample_neon
|
||||||
|
|
||||||
|
#define RGB_RED EXT_RGBX_RED
|
||||||
|
#define RGB_GREEN EXT_RGBX_GREEN
|
||||||
|
#define RGB_BLUE EXT_RGBX_BLUE
|
||||||
|
#define RGB_ALPHA 3
|
||||||
|
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||||
|
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
|
||||||
|
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_ALPHA
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef jsimd_h2v1_merged_upsample_neon
|
||||||
|
#undef jsimd_h2v2_merged_upsample_neon
|
||||||
|
|
||||||
|
#define RGB_RED EXT_BGR_RED
|
||||||
|
#define RGB_GREEN EXT_BGR_GREEN
|
||||||
|
#define RGB_BLUE EXT_BGR_BLUE
|
||||||
|
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||||
|
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
|
||||||
|
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef jsimd_h2v1_merged_upsample_neon
|
||||||
|
#undef jsimd_h2v2_merged_upsample_neon
|
||||||
|
|
||||||
|
#define RGB_RED EXT_BGRX_RED
|
||||||
|
#define RGB_GREEN EXT_BGRX_GREEN
|
||||||
|
#define RGB_BLUE EXT_BGRX_BLUE
|
||||||
|
#define RGB_ALPHA 3
|
||||||
|
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||||
|
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
|
||||||
|
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_ALPHA
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef jsimd_h2v1_merged_upsample_neon
|
||||||
|
#undef jsimd_h2v2_merged_upsample_neon
|
||||||
|
|
||||||
|
#define RGB_RED EXT_XBGR_RED
|
||||||
|
#define RGB_GREEN EXT_XBGR_GREEN
|
||||||
|
#define RGB_BLUE EXT_XBGR_BLUE
|
||||||
|
#define RGB_ALPHA 0
|
||||||
|
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||||
|
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
|
||||||
|
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_ALPHA
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef jsimd_h2v1_merged_upsample_neon
|
||||||
|
#undef jsimd_h2v2_merged_upsample_neon
|
||||||
|
|
||||||
|
#define RGB_RED EXT_XRGB_RED
|
||||||
|
#define RGB_GREEN EXT_XRGB_GREEN
|
||||||
|
#define RGB_BLUE EXT_XRGB_BLUE
|
||||||
|
#define RGB_ALPHA 0
|
||||||
|
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||||
|
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
|
||||||
|
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
|
||||||
|
#include "jdmrgext-neon.c"
|
||||||
|
#undef RGB_RED
|
||||||
|
#undef RGB_GREEN
|
||||||
|
#undef RGB_BLUE
|
||||||
|
#undef RGB_ALPHA
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef jsimd_h2v1_merged_upsample_neon
|
||||||
667
simd/arm/jdmrgext-neon.c
Normal file
667
simd/arm/jdmrgext-neon.c
Normal file
@@ -0,0 +1,667 @@
|
|||||||
|
/*
|
||||||
|
* jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||||
|
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* This file is included by jdmerge-neon.c. */
|
||||||
|
|
||||||
|
|
||||||
|
/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
|
||||||
|
* chroma upsampling and YCbCr -> RGB color conversion into a single function.
|
||||||
|
*
|
||||||
|
* As with the standalone functions, YCbCr -> RGB conversion is defined by the
|
||||||
|
* following equations:
|
||||||
|
* R = Y + 1.40200 * (Cr - 128)
|
||||||
|
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
|
||||||
|
* B = Y + 1.77200 * (Cb - 128)
|
||||||
|
*
|
||||||
|
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||||
|
* 0.3441467 = 11277 * 2^-15
|
||||||
|
* 0.7141418 = 23401 * 2^-15
|
||||||
|
* 1.4020386 = 22971 * 2^-14
|
||||||
|
* 1.7720337 = 29033 * 2^-14
|
||||||
|
* These constants are defined in jdmerge-neon.c.
|
||||||
|
*
|
||||||
|
* To ensure correct results, rounding is used when descaling.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
|
||||||
|
* routines:
|
||||||
|
*
|
||||||
|
* Input memory buffers can be safely overread up to the next multiple of
|
||||||
|
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
|
||||||
|
* jmemmgr.c.
|
||||||
|
*
|
||||||
|
* The output buffer cannot safely be written beyond output_width, since
|
||||||
|
* output_buf points to a possibly unpadded row in the decompressed image
|
||||||
|
* buffer allocated by the calling program.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||||
|
*/
|
||||||
|
|
||||||
|
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
|
||||||
|
JSAMPIMAGE input_buf,
|
||||||
|
JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf)
|
||||||
|
{
|
||||||
|
JSAMPROW outptr;
|
||||||
|
/* Pointers to Y, Cb, and Cr data */
|
||||||
|
JSAMPROW inptr0, inptr1, inptr2;
|
||||||
|
|
||||||
|
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
|
||||||
|
const int16x8_t neg_128 = vdupq_n_s16(-128);
|
||||||
|
|
||||||
|
inptr0 = input_buf[0][in_row_group_ctr];
|
||||||
|
inptr1 = input_buf[1][in_row_group_ctr];
|
||||||
|
inptr2 = input_buf[2][in_row_group_ctr];
|
||||||
|
outptr = output_buf[0];
|
||||||
|
|
||||||
|
int cols_remaining = output_width;
|
||||||
|
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||||
|
/* De-interleave Y component values into two separate vectors, one
|
||||||
|
* containing the component values with even-numbered indices and one
|
||||||
|
* containing the component values with odd-numbered indices.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t y = vld2_u8(inptr0);
|
||||||
|
uint8x8_t cb = vld1_u8(inptr1);
|
||||||
|
uint8x8_t cr = vld1_u8(inptr2);
|
||||||
|
/* Subtract 128 from Cb and Cr. */
|
||||||
|
int16x8_t cr_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||||
|
int16x8_t cb_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||||
|
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||||
|
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||||
|
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||||
|
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||||
|
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||||
|
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||||
|
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||||
|
vrshrn_n_s32(g_sub_y_h, 15));
|
||||||
|
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||||
|
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||||
|
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||||
|
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||||
|
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
|
||||||
|
* "odd" Y component values. This effectively upsamples the chroma
|
||||||
|
* components horizontally.
|
||||||
|
*/
|
||||||
|
int16x8_t g_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y.val[0]));
|
||||||
|
int16x8_t r_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y.val[0]));
|
||||||
|
int16x8_t b_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y.val[0]));
|
||||||
|
int16x8_t g_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y.val[1]));
|
||||||
|
int16x8_t r_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y.val[1]));
|
||||||
|
int16x8_t b_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y.val[1]));
|
||||||
|
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||||
|
* Re-interleave the "even" and "odd" component values.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
|
||||||
|
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
|
||||||
|
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
|
||||||
|
|
||||||
|
#ifdef RGB_ALPHA
|
||||||
|
uint8x16x4_t rgba;
|
||||||
|
rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
|
||||||
|
rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
|
||||||
|
rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
|
||||||
|
/* Set alpha channel to opaque (0xFF). */
|
||||||
|
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||||
|
/* Store RGBA pixel data to memory. */
|
||||||
|
vst4q_u8(outptr, rgba);
|
||||||
|
#else
|
||||||
|
uint8x16x3_t rgb;
|
||||||
|
rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
|
||||||
|
rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
|
||||||
|
rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
|
||||||
|
/* Store RGB pixel data to memory. */
|
||||||
|
vst3q_u8(outptr, rgb);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Increment pointers. */
|
||||||
|
inptr0 += 16;
|
||||||
|
inptr1 += 8;
|
||||||
|
inptr2 += 8;
|
||||||
|
outptr += (RGB_PIXELSIZE * 16);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cols_remaining > 0) {
|
||||||
|
/* De-interleave Y component values into two separate vectors, one
|
||||||
|
* containing the component values with even-numbered indices and one
|
||||||
|
* containing the component values with odd-numbered indices.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t y = vld2_u8(inptr0);
|
||||||
|
uint8x8_t cb = vld1_u8(inptr1);
|
||||||
|
uint8x8_t cr = vld1_u8(inptr2);
|
||||||
|
/* Subtract 128 from Cb and Cr. */
|
||||||
|
int16x8_t cr_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||||
|
int16x8_t cb_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||||
|
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||||
|
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||||
|
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||||
|
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||||
|
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||||
|
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||||
|
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||||
|
vrshrn_n_s32(g_sub_y_h, 15));
|
||||||
|
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||||
|
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||||
|
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||||
|
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||||
|
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
|
||||||
|
* "odd" Y component values. This effectively upsamples the chroma
|
||||||
|
* components horizontally.
|
||||||
|
*/
|
||||||
|
int16x8_t g_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y.val[0]));
|
||||||
|
int16x8_t r_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y.val[0]));
|
||||||
|
int16x8_t b_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y.val[0]));
|
||||||
|
int16x8_t g_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y.val[1]));
|
||||||
|
int16x8_t r_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y.val[1]));
|
||||||
|
int16x8_t b_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y.val[1]));
|
||||||
|
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||||
|
* Re-interleave the "even" and "odd" component values.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
|
||||||
|
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
|
||||||
|
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
|
||||||
|
|
||||||
|
#ifdef RGB_ALPHA
|
||||||
|
uint8x8x4_t rgba_h;
|
||||||
|
rgba_h.val[RGB_RED] = r.val[1];
|
||||||
|
rgba_h.val[RGB_GREEN] = g.val[1];
|
||||||
|
rgba_h.val[RGB_BLUE] = b.val[1];
|
||||||
|
/* Set alpha channel to opaque (0xFF). */
|
||||||
|
rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||||
|
uint8x8x4_t rgba_l;
|
||||||
|
rgba_l.val[RGB_RED] = r.val[0];
|
||||||
|
rgba_l.val[RGB_GREEN] = g.val[0];
|
||||||
|
rgba_l.val[RGB_BLUE] = b.val[0];
|
||||||
|
/* Set alpha channel to opaque (0xFF). */
|
||||||
|
rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||||
|
/* Store RGBA pixel data to memory. */
|
||||||
|
switch (cols_remaining) {
|
||||||
|
case 15:
|
||||||
|
vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
|
||||||
|
case 14:
|
||||||
|
vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
|
||||||
|
case 13:
|
||||||
|
vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
|
||||||
|
case 12:
|
||||||
|
vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
|
||||||
|
case 11:
|
||||||
|
vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
|
||||||
|
case 10:
|
||||||
|
vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
|
||||||
|
case 9:
|
||||||
|
vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
|
||||||
|
case 8:
|
||||||
|
vst4_u8(outptr, rgba_l);
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
|
||||||
|
case 6:
|
||||||
|
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
|
||||||
|
case 5:
|
||||||
|
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
|
||||||
|
case 4:
|
||||||
|
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
|
||||||
|
case 3:
|
||||||
|
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
|
||||||
|
case 2:
|
||||||
|
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
|
||||||
|
case 1:
|
||||||
|
vst4_lane_u8(outptr, rgba_l, 0);
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
uint8x8x3_t rgb_h;
|
||||||
|
rgb_h.val[RGB_RED] = r.val[1];
|
||||||
|
rgb_h.val[RGB_GREEN] = g.val[1];
|
||||||
|
rgb_h.val[RGB_BLUE] = b.val[1];
|
||||||
|
uint8x8x3_t rgb_l;
|
||||||
|
rgb_l.val[RGB_RED] = r.val[0];
|
||||||
|
rgb_l.val[RGB_GREEN] = g.val[0];
|
||||||
|
rgb_l.val[RGB_BLUE] = b.val[0];
|
||||||
|
/* Store RGB pixel data to memory. */
|
||||||
|
switch (cols_remaining) {
|
||||||
|
case 15:
|
||||||
|
vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
|
||||||
|
case 14:
|
||||||
|
vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
|
||||||
|
case 13:
|
||||||
|
vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
|
||||||
|
case 12:
|
||||||
|
vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
|
||||||
|
case 11:
|
||||||
|
vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
|
||||||
|
case 10:
|
||||||
|
vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
|
||||||
|
case 9:
|
||||||
|
vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
|
||||||
|
case 8:
|
||||||
|
vst3_u8(outptr, rgb_l);
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
|
||||||
|
case 6:
|
||||||
|
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
|
||||||
|
case 5:
|
||||||
|
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
|
||||||
|
case 4:
|
||||||
|
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
|
||||||
|
case 3:
|
||||||
|
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
|
||||||
|
case 2:
|
||||||
|
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
|
||||||
|
case 1:
|
||||||
|
vst3_lane_u8(outptr, rgb_l, 0);
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
|
||||||
|
*
|
||||||
|
* See comments above for details regarding color conversion and safe memory
|
||||||
|
* access.
|
||||||
|
*/
|
||||||
|
|
||||||
|
void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
|
||||||
|
JSAMPIMAGE input_buf,
|
||||||
|
JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf)
|
||||||
|
{
|
||||||
|
JSAMPROW outptr0, outptr1;
|
||||||
|
/* Pointers to Y (both rows), Cb, and Cr data */
|
||||||
|
JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
|
||||||
|
|
||||||
|
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
|
||||||
|
const int16x8_t neg_128 = vdupq_n_s16(-128);
|
||||||
|
|
||||||
|
inptr0_0 = input_buf[0][in_row_group_ctr * 2];
|
||||||
|
inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
|
||||||
|
inptr1 = input_buf[1][in_row_group_ctr];
|
||||||
|
inptr2 = input_buf[2][in_row_group_ctr];
|
||||||
|
outptr0 = output_buf[0];
|
||||||
|
outptr1 = output_buf[1];
|
||||||
|
|
||||||
|
int cols_remaining = output_width;
|
||||||
|
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||||
|
/* For each row, de-interleave Y component values into two separate
|
||||||
|
* vectors, one containing the component values with even-numbered indices
|
||||||
|
* and one containing the component values with odd-numbered indices.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t y0 = vld2_u8(inptr0_0);
|
||||||
|
uint8x8x2_t y1 = vld2_u8(inptr0_1);
|
||||||
|
uint8x8_t cb = vld1_u8(inptr1);
|
||||||
|
uint8x8_t cr = vld1_u8(inptr2);
|
||||||
|
/* Subtract 128 from Cb and Cr. */
|
||||||
|
int16x8_t cr_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||||
|
int16x8_t cb_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||||
|
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||||
|
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||||
|
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||||
|
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||||
|
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||||
|
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||||
|
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||||
|
vrshrn_n_s32(g_sub_y_h, 15));
|
||||||
|
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||||
|
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||||
|
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||||
|
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||||
|
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
|
||||||
|
* the "even" and "odd" Y component values. This effectively upsamples the
|
||||||
|
* chroma components both horizontally and vertically.
|
||||||
|
*/
|
||||||
|
int16x8_t g0_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y0.val[0]));
|
||||||
|
int16x8_t r0_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y0.val[0]));
|
||||||
|
int16x8_t b0_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y0.val[0]));
|
||||||
|
int16x8_t g0_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y0.val[1]));
|
||||||
|
int16x8_t r0_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y0.val[1]));
|
||||||
|
int16x8_t b0_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y0.val[1]));
|
||||||
|
int16x8_t g1_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y1.val[0]));
|
||||||
|
int16x8_t r1_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y1.val[0]));
|
||||||
|
int16x8_t b1_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y1.val[0]));
|
||||||
|
int16x8_t g1_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y1.val[1]));
|
||||||
|
int16x8_t r1_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y1.val[1]));
|
||||||
|
int16x8_t b1_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y1.val[1]));
|
||||||
|
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||||
|
* Re-interleave the "even" and "odd" component values.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
|
||||||
|
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
|
||||||
|
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
|
||||||
|
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
|
||||||
|
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
|
||||||
|
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
|
||||||
|
|
||||||
|
#ifdef RGB_ALPHA
|
||||||
|
uint8x16x4_t rgba0, rgba1;
|
||||||
|
rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
|
||||||
|
rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
|
||||||
|
rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
|
||||||
|
rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
|
||||||
|
rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
|
||||||
|
rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
|
||||||
|
/* Set alpha channel to opaque (0xFF). */
|
||||||
|
rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||||
|
rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||||
|
/* Store RGBA pixel data to memory. */
|
||||||
|
vst4q_u8(outptr0, rgba0);
|
||||||
|
vst4q_u8(outptr1, rgba1);
|
||||||
|
#else
|
||||||
|
uint8x16x3_t rgb0, rgb1;
|
||||||
|
rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
|
||||||
|
rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
|
||||||
|
rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
|
||||||
|
rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
|
||||||
|
rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
|
||||||
|
rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
|
||||||
|
/* Store RGB pixel data to memory. */
|
||||||
|
vst3q_u8(outptr0, rgb0);
|
||||||
|
vst3q_u8(outptr1, rgb1);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Increment pointers. */
|
||||||
|
inptr0_0 += 16;
|
||||||
|
inptr0_1 += 16;
|
||||||
|
inptr1 += 8;
|
||||||
|
inptr2 += 8;
|
||||||
|
outptr0 += (RGB_PIXELSIZE * 16);
|
||||||
|
outptr1 += (RGB_PIXELSIZE * 16);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cols_remaining > 0) {
|
||||||
|
/* For each row, de-interleave Y component values into two separate
|
||||||
|
* vectors, one containing the component values with even-numbered indices
|
||||||
|
* and one containing the component values with odd-numbered indices.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t y0 = vld2_u8(inptr0_0);
|
||||||
|
uint8x8x2_t y1 = vld2_u8(inptr0_1);
|
||||||
|
uint8x8_t cb = vld1_u8(inptr1);
|
||||||
|
uint8x8_t cr = vld1_u8(inptr2);
|
||||||
|
/* Subtract 128 from Cb and Cr. */
|
||||||
|
int16x8_t cr_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||||
|
int16x8_t cb_128 =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||||
|
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||||
|
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||||
|
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||||
|
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||||
|
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||||
|
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||||
|
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||||
|
vrshrn_n_s32(g_sub_y_h, 15));
|
||||||
|
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||||
|
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||||
|
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||||
|
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||||
|
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
|
||||||
|
* the "even" and "odd" Y component values. This effectively upsamples the
|
||||||
|
* chroma components both horizontally and vertically.
|
||||||
|
*/
|
||||||
|
int16x8_t g0_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y0.val[0]));
|
||||||
|
int16x8_t r0_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y0.val[0]));
|
||||||
|
int16x8_t b0_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y0.val[0]));
|
||||||
|
int16x8_t g0_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y0.val[1]));
|
||||||
|
int16x8_t r0_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y0.val[1]));
|
||||||
|
int16x8_t b0_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y0.val[1]));
|
||||||
|
int16x8_t g1_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y1.val[0]));
|
||||||
|
int16x8_t r1_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y1.val[0]));
|
||||||
|
int16x8_t b1_even =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y1.val[0]));
|
||||||
|
int16x8_t g1_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||||
|
y1.val[1]));
|
||||||
|
int16x8_t r1_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||||
|
y1.val[1]));
|
||||||
|
int16x8_t b1_odd =
|
||||||
|
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||||
|
y1.val[1]));
|
||||||
|
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||||
|
* Re-interleave the "even" and "odd" component values.
|
||||||
|
*/
|
||||||
|
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
|
||||||
|
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
|
||||||
|
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
|
||||||
|
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
|
||||||
|
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
|
||||||
|
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
|
||||||
|
|
||||||
|
#ifdef RGB_ALPHA
|
||||||
|
uint8x8x4_t rgba0_h, rgba1_h;
|
||||||
|
rgba0_h.val[RGB_RED] = r0.val[1];
|
||||||
|
rgba1_h.val[RGB_RED] = r1.val[1];
|
||||||
|
rgba0_h.val[RGB_GREEN] = g0.val[1];
|
||||||
|
rgba1_h.val[RGB_GREEN] = g1.val[1];
|
||||||
|
rgba0_h.val[RGB_BLUE] = b0.val[1];
|
||||||
|
rgba1_h.val[RGB_BLUE] = b1.val[1];
|
||||||
|
/* Set alpha channel to opaque (0xFF). */
|
||||||
|
rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||||
|
rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||||
|
|
||||||
|
uint8x8x4_t rgba0_l, rgba1_l;
|
||||||
|
rgba0_l.val[RGB_RED] = r0.val[0];
|
||||||
|
rgba1_l.val[RGB_RED] = r1.val[0];
|
||||||
|
rgba0_l.val[RGB_GREEN] = g0.val[0];
|
||||||
|
rgba1_l.val[RGB_GREEN] = g1.val[0];
|
||||||
|
rgba0_l.val[RGB_BLUE] = b0.val[0];
|
||||||
|
rgba1_l.val[RGB_BLUE] = b1.val[0];
|
||||||
|
/* Set alpha channel to opaque (0xFF). */
|
||||||
|
rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||||
|
rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||||
|
/* Store RGBA pixel data to memory. */
|
||||||
|
switch (cols_remaining) {
|
||||||
|
case 15:
|
||||||
|
vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
|
||||||
|
vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
|
||||||
|
case 14:
|
||||||
|
vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
|
||||||
|
vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
|
||||||
|
case 13:
|
||||||
|
vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
|
||||||
|
vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
|
||||||
|
case 12:
|
||||||
|
vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
|
||||||
|
vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
|
||||||
|
case 11:
|
||||||
|
vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
|
||||||
|
vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
|
||||||
|
case 10:
|
||||||
|
vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
|
||||||
|
vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
|
||||||
|
case 9:
|
||||||
|
vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
|
||||||
|
vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
|
||||||
|
case 8:
|
||||||
|
vst4_u8(outptr0, rgba0_l);
|
||||||
|
vst4_u8(outptr1, rgba1_l);
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
|
||||||
|
vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
|
||||||
|
case 6:
|
||||||
|
vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
|
||||||
|
vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
|
||||||
|
case 5:
|
||||||
|
vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
|
||||||
|
vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
|
||||||
|
case 4:
|
||||||
|
vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
|
||||||
|
vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
|
||||||
|
case 3:
|
||||||
|
vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
|
||||||
|
vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
|
||||||
|
case 2:
|
||||||
|
vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
|
||||||
|
vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
|
||||||
|
case 1:
|
||||||
|
vst4_lane_u8(outptr0, rgba0_l, 0);
|
||||||
|
vst4_lane_u8(outptr1, rgba1_l, 0);
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
uint8x8x3_t rgb0_h, rgb1_h;
|
||||||
|
rgb0_h.val[RGB_RED] = r0.val[1];
|
||||||
|
rgb1_h.val[RGB_RED] = r1.val[1];
|
||||||
|
rgb0_h.val[RGB_GREEN] = g0.val[1];
|
||||||
|
rgb1_h.val[RGB_GREEN] = g1.val[1];
|
||||||
|
rgb0_h.val[RGB_BLUE] = b0.val[1];
|
||||||
|
rgb1_h.val[RGB_BLUE] = b1.val[1];
|
||||||
|
|
||||||
|
uint8x8x3_t rgb0_l, rgb1_l;
|
||||||
|
rgb0_l.val[RGB_RED] = r0.val[0];
|
||||||
|
rgb1_l.val[RGB_RED] = r1.val[0];
|
||||||
|
rgb0_l.val[RGB_GREEN] = g0.val[0];
|
||||||
|
rgb1_l.val[RGB_GREEN] = g1.val[0];
|
||||||
|
rgb0_l.val[RGB_BLUE] = b0.val[0];
|
||||||
|
rgb1_l.val[RGB_BLUE] = b1.val[0];
|
||||||
|
/* Store RGB pixel data to memory. */
|
||||||
|
switch (cols_remaining) {
|
||||||
|
case 15:
|
||||||
|
vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
|
||||||
|
vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
|
||||||
|
case 14:
|
||||||
|
vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
|
||||||
|
vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
|
||||||
|
case 13:
|
||||||
|
vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
|
||||||
|
vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
|
||||||
|
case 12:
|
||||||
|
vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
|
||||||
|
vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
|
||||||
|
case 11:
|
||||||
|
vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
|
||||||
|
vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
|
||||||
|
case 10:
|
||||||
|
vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
|
||||||
|
vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
|
||||||
|
case 9:
|
||||||
|
vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
|
||||||
|
vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
|
||||||
|
case 8:
|
||||||
|
vst3_u8(outptr0, rgb0_l);
|
||||||
|
vst3_u8(outptr1, rgb1_l);
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
|
||||||
|
vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
|
||||||
|
case 6:
|
||||||
|
vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
|
||||||
|
vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
|
||||||
|
case 5:
|
||||||
|
vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
|
||||||
|
vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
|
||||||
|
case 4:
|
||||||
|
vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
|
||||||
|
vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
|
||||||
|
case 3:
|
||||||
|
vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
|
||||||
|
vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
|
||||||
|
case 2:
|
||||||
|
vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
|
||||||
|
vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
|
||||||
|
case 1:
|
||||||
|
vst3_lane_u8(outptr0, rgb0_l, 0);
|
||||||
|
vst3_lane_u8(outptr1, rgb1_l, 0);
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
44
simd/jsimd.h
44
simd/jsimd.h
@@ -824,6 +824,50 @@ EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
|
|||||||
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
JSAMPARRAY output_buf);
|
JSAMPARRAY output_buf);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_h2v1_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_h2v2_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
|
||||||
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
|
JSAMPARRAY output_buf);
|
||||||
|
|
||||||
EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
|
EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
|
||||||
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
|
||||||
JSAMPARRAY output_buf, JSAMPLE *range);
|
JSAMPARRAY output_buf, JSAMPLE *range);
|
||||||
|
|||||||
Reference in New Issue
Block a user