Neon: Intrinsics implementation of YCbCr->RGB

The previous AArch64 GAS implementation is retained by default when
using GCC, in order to avoid a performance regression.  The intrinsics
implementation can be forced on or off using the new NEON_INTRINSICS
CMake variable.  The previous AArch32 GAS implementation has been
removed, since the intrinsics implementation provides the same or better
performance.
This commit is contained in:
Jonathan Wright
2018-07-16 10:25:14 +01:00
committed by DRC
parent f3c3f01d23
commit 0f35cd68f2
7 changed files with 466 additions and 68 deletions

View File

@@ -272,7 +272,7 @@ if(NEON_INTRINSICS)
endif() endif()
if(NEON_INTRINSICS OR BITS EQUAL 32) if(NEON_INTRINSICS OR BITS EQUAL 32)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
arm/jfdctint-neon.c) arm/jdcolor-neon.c arm/jfdctint-neon.c)
endif() endif()
if(BITS EQUAL 32) if(BITS EQUAL 32)
set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon) set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)

View File

@@ -1270,14 +1270,9 @@ asm_function jsimd_idct_2x2_neon
/*****************************************************************************/ /*****************************************************************************/
/* /*
* jsimd_ycc_extrgb_convert_neon * jsimd_ycc_rgb565_convert_neon
* jsimd_ycc_extbgr_convert_neon
* jsimd_ycc_extrgbx_convert_neon
* jsimd_ycc_extbgrx_convert_neon
* jsimd_ycc_extxbgr_convert_neon
* jsimd_ycc_extxrgb_convert_neon
* *
* Colorspace conversion YCbCr -> RGB * Colorspace conversion YCbCr -> RGB565
*/ */
@@ -1319,39 +1314,7 @@ asm_function jsimd_idct_2x2_neon
.endm .endm
.macro do_store bpp, size .macro do_store bpp, size
.if \bpp == 24 .if \bpp == 16
.if \size == 8
vst3.8 {d10, d11, d12}, [RGB]!
.elseif \size == 4
vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
vst4.8 {d10, d11, d12, d13}, [RGB]!
.elseif \size == 4
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.elseif \bpp == 16
.if \size == 8 .if \size == 8
vst1.16 {q15}, [RGB]! vst1.16 {q15}, [RGB]!
.elseif \size == 4 .elseif \size == 4
@@ -1398,17 +1361,11 @@ asm_function jsimd_idct_2x2_neon
vaddw.u8 q11, q10, d0 vaddw.u8 q11, q10, d0
vaddw.u8 q12, q12, d0 vaddw.u8 q12, q12, d0
vaddw.u8 q14, q14, d0 vaddw.u8 q14, q14, d0
.if \bpp != 16
vqmovun.s16 d1\g_offs, q11
vqmovun.s16 d1\r_offs, q12
vqmovun.s16 d1\b_offs, q14
.else /* rgb565 */
vqshlu.s16 q13, q11, #8 vqshlu.s16 q13, q11, #8
vqshlu.s16 q15, q12, #8 vqshlu.s16 q15, q12, #8
vqshlu.s16 q14, q14, #8 vqshlu.s16 q14, q14, #8
vsri.u16 q15, q13, #5 vsri.u16 q15, q13, #5
vsri.u16 q15, q14, #11 vsri.u16 q15, q14, #11
.endif
.endm .endm
.macro do_yuv_to_rgb_stage2_store_load_stage1 .macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1431,20 +1388,6 @@ asm_function jsimd_idct_2x2_neon
vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
vaddw.u8 q12, q12, d0 vaddw.u8 q12, q12, d0
vaddw.u8 q14, q14, d0 vaddw.u8 q14, q14, d0
.if \bpp != 16 /**************** rgb24/rgb32 ******************************/
vqmovun.s16 d1\g_offs, q11
pld [Y, #64]
vqmovun.s16 d1\r_offs, q12
vld1.8 {d0}, [Y, :64]!
vqmovun.s16 d1\b_offs, q14
vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
do_store \bpp, 8
vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
.else /**************************** rgb565 ********************************/
vqshlu.s16 q13, q11, #8 vqshlu.s16 q13, q11, #8
pld [Y, #64] pld [Y, #64]
vqshlu.s16 q15, q12, #8 vqshlu.s16 q15, q12, #8
@@ -1459,7 +1402,6 @@ asm_function jsimd_idct_2x2_neon
vmull.s16 q14, d6, d1[3] vmull.s16 q14, d6, d1[3]
do_store \bpp, 8 do_store \bpp, 8
vmull.s16 q15, d7, d1[3] vmull.s16 q15, d7, d1[3]
.endif
.endm .endm
.macro do_yuv_to_rgb .macro do_yuv_to_rgb
@@ -1595,12 +1537,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.endm .endm
/*--------------------------------- id ----- bpp R G B */ /*--------------------------------- id ----- bpp R G B */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
.purgem do_load .purgem do_load

View File

@@ -349,20 +349,28 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
switch (cinfo->out_color_space) { switch (cinfo->out_color_space) {
case JCS_EXT_RGB: case JCS_EXT_RGB:
#ifndef NEON_INTRINSICS
if (simd_features & JSIMD_FASTST3) if (simd_features & JSIMD_FASTST3)
#endif
neonfct = jsimd_ycc_extrgb_convert_neon; neonfct = jsimd_ycc_extrgb_convert_neon;
#ifndef NEON_INTRINSICS
else else
neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
#endif
break; break;
case JCS_EXT_RGBX: case JCS_EXT_RGBX:
case JCS_EXT_RGBA: case JCS_EXT_RGBA:
neonfct = jsimd_ycc_extrgbx_convert_neon; neonfct = jsimd_ycc_extrgbx_convert_neon;
break; break;
case JCS_EXT_BGR: case JCS_EXT_BGR:
#ifndef NEON_INTRINSICS
if (simd_features & JSIMD_FASTST3) if (simd_features & JSIMD_FASTST3)
#endif
neonfct = jsimd_ycc_extbgr_convert_neon; neonfct = jsimd_ycc_extbgr_convert_neon;
#ifndef NEON_INTRINSICS
else else
neonfct = jsimd_ycc_extbgr_convert_neon_slowst3; neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
#endif
break; break;
case JCS_EXT_BGRX: case JCS_EXT_BGRX:
case JCS_EXT_BGRA: case JCS_EXT_BGRA:
@@ -377,10 +385,14 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
neonfct = jsimd_ycc_extxrgb_convert_neon; neonfct = jsimd_ycc_extxrgb_convert_neon;
break; break;
default: default:
#ifndef NEON_INTRINSICS
if (simd_features & JSIMD_FASTST3) if (simd_features & JSIMD_FASTST3)
#endif
neonfct = jsimd_ycc_extrgb_convert_neon; neonfct = jsimd_ycc_extrgb_convert_neon;
#ifndef NEON_INTRINSICS
else else
neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
#endif
break; break;
} }

View File

@@ -1929,16 +1929,20 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endm .endm
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/
#ifndef NEON_INTRINSICS
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
#endif
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
#ifndef NEON_INTRINSICS
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
#endif
.purgem do_load .purgem do_load
.purgem do_store .purgem do_store

309
simd/arm/jdcolext-neon.c Normal file
View File

@@ -0,0 +1,309 @@
/*
* jdcolext-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdcolor-neon.c. */
/* YCbCr -> RGB conversion is defined by the following equations:
* R = Y + 1.40200 * (Cr - 128)
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
* B = Y + 1.77200 * (Cb - 128)
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.3441467 = 11277 * 2^-15
* 0.7141418 = 23401 * 2^-15
* 1.4020386 = 22971 * 2^-14
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdcolor-neon.c.
*
* To ensure correct results, rounding is used when descaling.
*/
/* Notes on safe memory access for YCbCr -> RGB conversion routines:
*
* Input memory buffers can be safely overread up to the next multiple of
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
* The output buffer cannot safely be written beyond output_width, since
* output_buf points to a possibly unpadded row in the decompressed image
* buffer allocated by the calling program.
*/
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
JSAMPROW outptr;
/* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
while (--num_rows >= 0) {
inptr0 = input_buf[0][input_row];
inptr1 = input_buf[1][input_row];
inptr2 = input_buf[2][input_row];
input_row++;
outptr = *output_buf++;
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
uint8x16_t y = vld1q_u8(inptr0);
uint8x16_t cb = vld1q_u8(inptr1);
uint8x16_t cr = vld1q_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_low_u8(cr)));
int16x8_t cr_128_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_high_u8(cr)));
int16x8_t cb_128_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_low_u8(cb)));
int16x8_t cb_128_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_high_u8(cb)));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
consts, 0);
int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
consts, 0);
g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
consts, 1);
g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
consts, 1);
g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
consts, 1);
g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
vrshrn_n_s32(g_sub_y_lh, 15));
int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
vrshrn_n_s32(g_sub_y_hh, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
consts, 2);
int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
consts, 3);
int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
consts, 3);
/* Add Y. */
int16x8_t r_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
vget_low_u8(y)));
int16x8_t r_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
vget_high_u8(y)));
int16x8_t b_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
vget_low_u8(y)));
int16x8_t b_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
vget_high_u8(y)));
int16x8_t g_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
vget_low_u8(y)));
int16x8_t g_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
vget_high_u8(y)));
#ifdef RGB_ALPHA
uint8x16x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr, rgba);
#else
uint8x16x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
#endif
/* Increment pointers. */
inptr0 += 16;
inptr1 += 16;
inptr2 += 16;
outptr += (RGB_PIXELSIZE * 16);
}
if (cols_remaining >= 8) {
uint8x8_t y = vld1_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
consts, 3);
/* Add Y. */
int16x8_t r =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
int16x8_t b =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
int16x8_t g =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#ifdef RGB_ALPHA
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
rgba.val[RGB_GREEN] = vqmovun_s16(g);
rgba.val[RGB_BLUE] = vqmovun_s16(b);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4_u8(outptr, rgba);
#else
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
rgb.val[RGB_GREEN] = vqmovun_s16(g);
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
vst3_u8(outptr, rgb);
#endif
/* Increment pointers. */
inptr0 += 8;
inptr1 += 8;
inptr2 += 8;
outptr += (RGB_PIXELSIZE * 8);
cols_remaining -= 8;
}
/* Handle the tail elements. */
if (cols_remaining > 0) {
uint8x8_t y = vld1_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
consts, 3);
/* Add Y. */
int16x8_t r =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
int16x8_t b =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
int16x8_t g =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#ifdef RGB_ALPHA
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
rgba.val[RGB_GREEN] = vqmovun_s16(g);
rgba.val[RGB_BLUE] = vqmovun_s16(b);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
case 1:
vst4_lane_u8(outptr, rgba, 0);
default:
break;
}
#else
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
rgb.val[RGB_GREEN] = vqmovun_s16(g);
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
case 1:
vst3_lane_u8(outptr, rgb, 0);
default:
break;
}
#endif
}
}
}

133
simd/arm/jdcolor-neon.c Normal file
View File

@@ -0,0 +1,133 @@
/*
* jdcolor-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
-F_0_344, F_0_714, F_1_402, F_1_772
};
/* Include inline routines for colorspace extensions. */
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon

View File

@@ -450,6 +450,8 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows); JSAMPARRAY output_buf, int num_rows);
#ifndef NEON_INTRINSICS
EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows); JSAMPARRAY output_buf, int num_rows);
@@ -457,6 +459,8 @@ EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows); JSAMPARRAY output_buf, int num_rows);
#endif
EXTERN(void) jsimd_ycc_rgb_convert_dspr2 EXTERN(void) jsimd_ycc_rgb_convert_dspr2
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows); JSAMPARRAY output_buf, int num_rows);