Neon: Intrinsics implementation of YCbCr->RGB
The previous AArch64 GAS implementation is retained by default when using GCC, in order to avoid a performance regression. The intrinsics implementation can be forced on or off using the new NEON_INTRINSICS CMake variable. The previous AArch32 GAS implementation has been removed, since the intrinsics implementation provides the same or better performance.
This commit is contained in:
@@ -272,7 +272,7 @@ if(NEON_INTRINSICS)
|
||||
endif()
|
||||
if(NEON_INTRINSICS OR BITS EQUAL 32)
|
||||
set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
|
||||
arm/jfdctint-neon.c)
|
||||
arm/jdcolor-neon.c arm/jfdctint-neon.c)
|
||||
endif()
|
||||
if(BITS EQUAL 32)
|
||||
set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)
|
||||
|
||||
@@ -1270,14 +1270,9 @@ asm_function jsimd_idct_2x2_neon
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_ycc_extrgb_convert_neon
|
||||
* jsimd_ycc_extbgr_convert_neon
|
||||
* jsimd_ycc_extrgbx_convert_neon
|
||||
* jsimd_ycc_extbgrx_convert_neon
|
||||
* jsimd_ycc_extxbgr_convert_neon
|
||||
* jsimd_ycc_extxrgb_convert_neon
|
||||
* jsimd_ycc_rgb565_convert_neon
|
||||
*
|
||||
* Colorspace conversion YCbCr -> RGB
|
||||
* Colorspace conversion YCbCr -> RGB565
|
||||
*/
|
||||
|
||||
|
||||
@@ -1319,39 +1314,7 @@ asm_function jsimd_idct_2x2_neon
|
||||
.endm
|
||||
|
||||
.macro do_store bpp, size
|
||||
.if \bpp == 24
|
||||
.if \size == 8
|
||||
vst3.8 {d10, d11, d12}, [RGB]!
|
||||
.elseif \size == 4
|
||||
vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
|
||||
vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
|
||||
vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
|
||||
vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
|
||||
.elseif \size == 2
|
||||
vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
|
||||
vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
|
||||
.elseif \size == 1
|
||||
vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
|
||||
.else
|
||||
.error unsupported macroblock size
|
||||
.endif
|
||||
.elseif \bpp == 32
|
||||
.if \size == 8
|
||||
vst4.8 {d10, d11, d12, d13}, [RGB]!
|
||||
.elseif \size == 4
|
||||
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
|
||||
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
|
||||
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
|
||||
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
|
||||
.elseif \size == 2
|
||||
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
|
||||
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
|
||||
.elseif \size == 1
|
||||
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
|
||||
.else
|
||||
.error unsupported macroblock size
|
||||
.endif
|
||||
.elseif \bpp == 16
|
||||
.if \bpp == 16
|
||||
.if \size == 8
|
||||
vst1.16 {q15}, [RGB]!
|
||||
.elseif \size == 4
|
||||
@@ -1398,17 +1361,11 @@ asm_function jsimd_idct_2x2_neon
|
||||
vaddw.u8 q11, q10, d0
|
||||
vaddw.u8 q12, q12, d0
|
||||
vaddw.u8 q14, q14, d0
|
||||
.if \bpp != 16
|
||||
vqmovun.s16 d1\g_offs, q11
|
||||
vqmovun.s16 d1\r_offs, q12
|
||||
vqmovun.s16 d1\b_offs, q14
|
||||
.else /* rgb565 */
|
||||
vqshlu.s16 q13, q11, #8
|
||||
vqshlu.s16 q15, q12, #8
|
||||
vqshlu.s16 q14, q14, #8
|
||||
vsri.u16 q15, q13, #5
|
||||
vsri.u16 q15, q14, #11
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb_stage2_store_load_stage1
|
||||
@@ -1431,20 +1388,6 @@ asm_function jsimd_idct_2x2_neon
|
||||
vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
|
||||
vaddw.u8 q12, q12, d0
|
||||
vaddw.u8 q14, q14, d0
|
||||
.if \bpp != 16 /**************** rgb24/rgb32 ******************************/
|
||||
vqmovun.s16 d1\g_offs, q11
|
||||
pld [Y, #64]
|
||||
vqmovun.s16 d1\r_offs, q12
|
||||
vld1.8 {d0}, [Y, :64]!
|
||||
vqmovun.s16 d1\b_offs, q14
|
||||
vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
|
||||
vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
|
||||
do_store \bpp, 8
|
||||
vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
|
||||
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
|
||||
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
|
||||
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
|
||||
.else /**************************** rgb565 ********************************/
|
||||
vqshlu.s16 q13, q11, #8
|
||||
pld [Y, #64]
|
||||
vqshlu.s16 q15, q12, #8
|
||||
@@ -1459,7 +1402,6 @@ asm_function jsimd_idct_2x2_neon
|
||||
vmull.s16 q14, d6, d1[3]
|
||||
do_store \bpp, 8
|
||||
vmull.s16 q15, d7, d1[3]
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb
|
||||
@@ -1595,12 +1537,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
.endm
|
||||
|
||||
/*--------------------------------- id ----- bpp R G B */
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
|
||||
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
|
||||
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
|
||||
|
||||
.purgem do_load
|
||||
|
||||
@@ -349,20 +349,28 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||
|
||||
switch (cinfo->out_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
#ifndef NEON_INTRINSICS
|
||||
if (simd_features & JSIMD_FASTST3)
|
||||
#endif
|
||||
neonfct = jsimd_ycc_extrgb_convert_neon;
|
||||
#ifndef NEON_INTRINSICS
|
||||
else
|
||||
neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
neonfct = jsimd_ycc_extrgbx_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
#ifndef NEON_INTRINSICS
|
||||
if (simd_features & JSIMD_FASTST3)
|
||||
#endif
|
||||
neonfct = jsimd_ycc_extbgr_convert_neon;
|
||||
#ifndef NEON_INTRINSICS
|
||||
else
|
||||
neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
@@ -377,10 +385,14 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||
neonfct = jsimd_ycc_extxrgb_convert_neon;
|
||||
break;
|
||||
default:
|
||||
#ifndef NEON_INTRINSICS
|
||||
if (simd_features & JSIMD_FASTST3)
|
||||
#endif
|
||||
neonfct = jsimd_ycc_extrgb_convert_neon;
|
||||
#ifndef NEON_INTRINSICS
|
||||
else
|
||||
neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@@ -1929,16 +1929,20 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
.endm
|
||||
|
||||
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/
|
||||
#ifndef NEON_INTRINSICS
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
|
||||
#endif
|
||||
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
|
||||
|
||||
#ifndef NEON_INTRINSICS
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
|
||||
#endif
|
||||
|
||||
.purgem do_load
|
||||
.purgem do_store
|
||||
|
||||
309
simd/arm/jdcolext-neon.c
Normal file
309
simd/arm/jdcolext-neon.c
Normal file
@@ -0,0 +1,309 @@
|
||||
/*
|
||||
* jdcolext-neon.c - colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jdcolor-neon.c. */
|
||||
|
||||
|
||||
/* YCbCr -> RGB conversion is defined by the following equations:
|
||||
* R = Y + 1.40200 * (Cr - 128)
|
||||
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
|
||||
* B = Y + 1.77200 * (Cb - 128)
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.3441467 = 11277 * 2^-15
|
||||
* 0.7141418 = 23401 * 2^-15
|
||||
* 1.4020386 = 22971 * 2^-14
|
||||
* 1.7720337 = 29033 * 2^-14
|
||||
* These constants are defined in jdcolor-neon.c.
|
||||
*
|
||||
* To ensure correct results, rounding is used when descaling.
|
||||
*/
|
||||
|
||||
/* Notes on safe memory access for YCbCr -> RGB conversion routines:
|
||||
*
|
||||
* Input memory buffers can be safely overread up to the next multiple of
|
||||
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
|
||||
* jmemmgr.c.
|
||||
*
|
||||
* The output buffer cannot safely be written beyond output_width, since
|
||||
* output_buf points to a possibly unpadded row in the decompressed image
|
||||
* buffer allocated by the calling program.
|
||||
*/
|
||||
|
||||
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
|
||||
JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
int num_rows)
|
||||
{
|
||||
JSAMPROW outptr;
|
||||
/* Pointers to Y, Cb, and Cr data */
|
||||
JSAMPROW inptr0, inptr1, inptr2;
|
||||
|
||||
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
|
||||
const int16x8_t neg_128 = vdupq_n_s16(-128);
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr0 = input_buf[0][input_row];
|
||||
inptr1 = input_buf[1][input_row];
|
||||
inptr2 = input_buf[2][input_row];
|
||||
input_row++;
|
||||
outptr = *output_buf++;
|
||||
int cols_remaining = output_width;
|
||||
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||
uint8x16_t y = vld1q_u8(inptr0);
|
||||
uint8x16_t cb = vld1q_u8(inptr1);
|
||||
uint8x16_t cr = vld1q_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_low_u8(cr)));
|
||||
int16x8_t cr_128_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_high_u8(cr)));
|
||||
int16x8_t cb_128_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_low_u8(cb)));
|
||||
int16x8_t cb_128_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_high_u8(cb)));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
|
||||
int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
|
||||
consts, 0);
|
||||
int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
|
||||
int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
|
||||
consts, 0);
|
||||
g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
|
||||
consts, 1);
|
||||
g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
|
||||
consts, 1);
|
||||
g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
|
||||
consts, 1);
|
||||
g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
|
||||
consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
|
||||
vrshrn_n_s32(g_sub_y_lh, 15));
|
||||
int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
|
||||
vrshrn_n_s32(g_sub_y_hh, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
|
||||
consts, 2);
|
||||
int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
|
||||
consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
|
||||
consts, 3);
|
||||
int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
|
||||
consts, 3);
|
||||
/* Add Y. */
|
||||
int16x8_t r_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
|
||||
vget_low_u8(y)));
|
||||
int16x8_t r_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
|
||||
vget_high_u8(y)));
|
||||
int16x8_t b_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
|
||||
vget_low_u8(y)));
|
||||
int16x8_t b_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
|
||||
vget_high_u8(y)));
|
||||
int16x8_t g_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
|
||||
vget_low_u8(y)));
|
||||
int16x8_t g_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
|
||||
vget_high_u8(y)));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x16x4_t rgba;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
|
||||
rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
|
||||
rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
vst4q_u8(outptr, rgba);
|
||||
#else
|
||||
uint8x16x3_t rgb;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
|
||||
rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
|
||||
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst3q_u8(outptr, rgb);
|
||||
#endif
|
||||
/* Increment pointers. */
|
||||
inptr0 += 16;
|
||||
inptr1 += 16;
|
||||
inptr2 += 16;
|
||||
outptr += (RGB_PIXELSIZE * 16);
|
||||
}
|
||||
|
||||
if (cols_remaining >= 8) {
|
||||
uint8x8_t y = vld1_u8(inptr0);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
|
||||
consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
|
||||
consts, 3);
|
||||
/* Add Y. */
|
||||
int16x8_t r =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
|
||||
int16x8_t b =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
|
||||
int16x8_t g =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x8x4_t rgba;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgba.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgba.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgba.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
vst4_u8(outptr, rgba);
|
||||
#else
|
||||
uint8x8x3_t rgb;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgb.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgb.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgb.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst3_u8(outptr, rgb);
|
||||
#endif
|
||||
/* Increment pointers. */
|
||||
inptr0 += 8;
|
||||
inptr1 += 8;
|
||||
inptr2 += 8;
|
||||
outptr += (RGB_PIXELSIZE * 8);
|
||||
cols_remaining -= 8;
|
||||
}
|
||||
|
||||
/* Handle the tail elements. */
|
||||
if (cols_remaining > 0) {
|
||||
uint8x8_t y = vld1_u8(inptr0);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
|
||||
consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
|
||||
consts, 3);
|
||||
/* Add Y. */
|
||||
int16x8_t r =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
|
||||
int16x8_t b =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
|
||||
int16x8_t g =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x8x4_t rgba;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgba.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgba.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgba.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 7:
|
||||
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
|
||||
case 6:
|
||||
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
|
||||
case 5:
|
||||
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
|
||||
case 4:
|
||||
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
|
||||
case 3:
|
||||
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
|
||||
case 2:
|
||||
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
|
||||
case 1:
|
||||
vst4_lane_u8(outptr, rgba, 0);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#else
|
||||
uint8x8x3_t rgb;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgb.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgb.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgb.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Store RGB pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 7:
|
||||
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
|
||||
case 6:
|
||||
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
|
||||
case 5:
|
||||
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
|
||||
case 4:
|
||||
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
|
||||
case 3:
|
||||
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
|
||||
case 2:
|
||||
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
|
||||
case 1:
|
||||
vst3_lane_u8(outptr, rgb, 0);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
133
simd/arm/jdcolor-neon.c
Normal file
133
simd/arm/jdcolor-neon.c
Normal file
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
* jdcolor-neon.c - colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* YCbCr -> RGB conversion constants */
|
||||
|
||||
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
|
||||
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
|
||||
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
|
||||
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
|
||||
-F_0_344, F_0_714, F_1_402, F_1_772
|
||||
};
|
||||
|
||||
|
||||
/* Include inline routines for colorspace extensions. */
|
||||
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_RED EXT_RGB_RED
|
||||
#define RGB_GREEN EXT_RGB_GREEN
|
||||
#define RGB_BLUE EXT_RGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_RGBX_RED
|
||||
#define RGB_GREEN EXT_RGBX_GREEN
|
||||
#define RGB_BLUE EXT_RGBX_BLUE
|
||||
#define RGB_ALPHA 3
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGR_RED
|
||||
#define RGB_GREEN EXT_BGR_GREEN
|
||||
#define RGB_BLUE EXT_BGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGRX_RED
|
||||
#define RGB_GREEN EXT_BGRX_GREEN
|
||||
#define RGB_BLUE EXT_BGRX_BLUE
|
||||
#define RGB_ALPHA 3
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XBGR_RED
|
||||
#define RGB_GREEN EXT_XBGR_GREEN
|
||||
#define RGB_BLUE EXT_XBGR_BLUE
|
||||
#define RGB_ALPHA 0
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XRGB_RED
|
||||
#define RGB_GREEN EXT_XRGB_GREEN
|
||||
#define RGB_BLUE EXT_XRGB_BLUE
|
||||
#define RGB_ALPHA 0
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
@@ -450,6 +450,8 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
|
||||
#ifndef NEON_INTRINSICS
|
||||
|
||||
EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
@@ -457,6 +459,8 @@ EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
|
||||
#endif
|
||||
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_dspr2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
|
||||
Reference in New Issue
Block a user