From 0f35cd68f2328983fd97f9a393c23455b9ddba99 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 16 Jul 2018 10:25:14 +0100 Subject: [PATCH] Neon: Intrinsics implementation of YCbCr->RGB The previous AArch64 GAS implementation is retained by default when using GCC, in order to avoid a performance regression. The intrinsics implementation can be forced on or off using the new NEON_INTRINSICS CMake variable. The previous AArch32 GAS implementation has been removed, since the intrinsics implementation provides the same or better performance. --- simd/CMakeLists.txt | 2 +- simd/arm/aarch32/jsimd_neon.S | 70 +------- simd/arm/aarch64/jsimd.c | 12 ++ simd/arm/aarch64/jsimd_neon.S | 4 + simd/arm/jdcolext-neon.c | 309 ++++++++++++++++++++++++++++++++++ simd/arm/jdcolor-neon.c | 133 +++++++++++++++ simd/jsimd.h | 4 + 7 files changed, 466 insertions(+), 68 deletions(-) create mode 100644 simd/arm/jdcolext-neon.c create mode 100644 simd/arm/jdcolor-neon.c diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 223251cd..db1d966a 100644 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -272,7 +272,7 @@ if(NEON_INTRINSICS) endif() if(NEON_INTRINSICS OR BITS EQUAL 32) set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c - arm/jfdctint-neon.c) + arm/jdcolor-neon.c arm/jfdctint-neon.c) endif() if(BITS EQUAL 32) set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon) diff --git a/simd/arm/aarch32/jsimd_neon.S b/simd/arm/aarch32/jsimd_neon.S index 4cd293d8..cfebe1b5 100644 --- a/simd/arm/aarch32/jsimd_neon.S +++ b/simd/arm/aarch32/jsimd_neon.S @@ -1270,14 +1270,9 @@ asm_function jsimd_idct_2x2_neon /*****************************************************************************/ /* - * jsimd_ycc_extrgb_convert_neon - * jsimd_ycc_extbgr_convert_neon - * jsimd_ycc_extrgbx_convert_neon - * jsimd_ycc_extbgrx_convert_neon - * jsimd_ycc_extxbgr_convert_neon - * jsimd_ycc_extxrgb_convert_neon + * jsimd_ycc_rgb565_convert_neon * - * Colorspace conversion YCbCr -> RGB + * Colorspace conversion YCbCr -> RGB565 */ @@ -1319,39 +1314,7 @@ asm_function jsimd_idct_2x2_neon .endm .macro do_store bpp, size - .if \bpp == 24 - .if \size == 8 - vst3.8 {d10, d11, d12}, [RGB]! - .elseif \size == 4 - vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vst4.8 {d10, d11, d12, d13}, [RGB]! - .elseif \size == 4 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 16 + .if \bpp == 16 .if \size == 8 vst1.16 {q15}, [RGB]! .elseif \size == 4 @@ -1398,17 +1361,11 @@ asm_function jsimd_idct_2x2_neon vaddw.u8 q11, q10, d0 vaddw.u8 q12, q12, d0 vaddw.u8 q14, q14, d0 - .if \bpp != 16 - vqmovun.s16 d1\g_offs, q11 - vqmovun.s16 d1\r_offs, q12 - vqmovun.s16 d1\b_offs, q14 - .else /* rgb565 */ vqshlu.s16 q13, q11, #8 vqshlu.s16 q15, q12, #8 vqshlu.s16 q14, q14, #8 vsri.u16 q15, q13, #5 vsri.u16 q15, q14, #11 - .endif .endm .macro do_yuv_to_rgb_stage2_store_load_stage1 @@ -1431,20 +1388,6 @@ asm_function jsimd_idct_2x2_neon vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ vaddw.u8 q12, q12, d0 vaddw.u8 q14, q14, d0 - .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ - vqmovun.s16 d1\g_offs, q11 - pld [Y, #64] - vqmovun.s16 d1\r_offs, q12 - vld1.8 {d0}, [Y, :64]! - vqmovun.s16 d1\b_offs, q14 - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - do_store \bpp, 8 - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ - .else /**************************** rgb565 ********************************/ vqshlu.s16 q13, q11, #8 pld [Y, #64] vqshlu.s16 q15, q12, #8 @@ -1459,7 +1402,6 @@ asm_function jsimd_idct_2x2_neon vmull.s16 q14, d6, d1[3] do_store \bpp, 8 vmull.s16 q15, d7, d1[3] - .endif .endm .macro do_yuv_to_rgb @@ -1595,12 +1537,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .endm /*--------------------------------- id ----- bpp R G B */ -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 .purgem do_load diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c index 9a1196b8..b588d8be 100644 --- a/simd/arm/aarch64/jsimd.c +++ b/simd/arm/aarch64/jsimd.c @@ -349,20 +349,28 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, switch (cinfo->out_color_space) { case JCS_EXT_RGB: +#ifndef NEON_INTRINSICS if (simd_features & JSIMD_FASTST3) +#endif neonfct = jsimd_ycc_extrgb_convert_neon; +#ifndef NEON_INTRINSICS else neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; +#endif break; case JCS_EXT_RGBX: case JCS_EXT_RGBA: neonfct = jsimd_ycc_extrgbx_convert_neon; break; case JCS_EXT_BGR: +#ifndef NEON_INTRINSICS if (simd_features & JSIMD_FASTST3) +#endif neonfct = jsimd_ycc_extbgr_convert_neon; +#ifndef NEON_INTRINSICS else neonfct = jsimd_ycc_extbgr_convert_neon_slowst3; +#endif break; case JCS_EXT_BGRX: case JCS_EXT_BGRA: @@ -377,10 +385,14 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, neonfct = jsimd_ycc_extxrgb_convert_neon; break; default: +#ifndef NEON_INTRINSICS if (simd_features & JSIMD_FASTST3) +#endif neonfct = jsimd_ycc_extrgb_convert_neon; +#ifndef NEON_INTRINSICS else neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; +#endif break; } diff --git a/simd/arm/aarch64/jsimd_neon.S b/simd/arm/aarch64/jsimd_neon.S index 9af70621..dd97aae9 100644 --- a/simd/arm/aarch64/jsimd_neon.S +++ b/simd/arm/aarch64/jsimd_neon.S @@ -1929,16 +1929,20 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 .endm /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ +#ifndef NEON_INTRINSICS generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 +#endif generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 +#ifndef NEON_INTRINSICS generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 +#endif .purgem do_load .purgem do_store diff --git a/simd/arm/jdcolext-neon.c b/simd/arm/jdcolext-neon.c new file mode 100644 index 00000000..5e4baa94 --- /dev/null +++ b/simd/arm/jdcolext-neon.c @@ -0,0 +1,309 @@ +/* + * jdcolext-neon.c - colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-neon.c. */ + + +/* YCbCr -> RGB conversion is defined by the following equations: + * R = Y + 1.40200 * (Cr - 128) + * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) + * B = Y + 1.77200 * (Cb - 128) + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.3441467 = 11277 * 2^-15 + * 0.7141418 = 23401 * 2^-15 + * 1.4020386 = 22971 * 2^-14 + * 1.7720337 = 29033 * 2^-14 + * These constants are defined in jdcolor-neon.c. + * + * To ensure correct results, rounding is used when descaling. + */ + +/* Notes on safe memory access for YCbCr -> RGB conversion routines: + * + * Input memory buffers can be safely overread up to the next multiple of + * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in + * jmemmgr.c. + * + * The output buffer cannot safely be written beyond output_width, since + * output_buf points to a possibly unpadded row in the decompressed image + * buffer allocated by the calling program. + */ + +void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + JSAMPROW outptr; + /* Pointers to Y, Cb, and Cr data */ + JSAMPROW inptr0, inptr1, inptr2; + + const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); + const int16x8_t neg_128 = vdupq_n_s16(-128); + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + int cols_remaining = output_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + uint8x16_t y = vld1q_u8(inptr0); + uint8x16_t cb = vld1q_u8(inptr1); + uint8x16_t cr = vld1q_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_low_u8(cr))); + int16x8_t cr_128_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_high_u8(cr))); + int16x8_t cb_128_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_low_u8(cb))); + int16x8_t cb_128_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_high_u8(cb))); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0); + int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l), + consts, 0); + int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0); + int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h), + consts, 0); + g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l), + consts, 1); + g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l), + consts, 1); + g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h), + consts, 1); + g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h), + consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit. */ + int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15), + vrshrn_n_s32(g_sub_y_lh, 15)); + int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15), + vrshrn_n_s32(g_sub_y_hh, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1), + consts, 2); + int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1), + consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1), + consts, 3); + int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1), + consts, 3); + /* Add Y. */ + int16x8_t r_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l), + vget_low_u8(y))); + int16x8_t r_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h), + vget_high_u8(y))); + int16x8_t b_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l), + vget_low_u8(y))); + int16x8_t b_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h), + vget_high_u8(y))); + int16x8_t g_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l), + vget_low_u8(y))); + int16x8_t g_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), + vget_high_u8(y))); + +#ifdef RGB_ALPHA + uint8x16x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); + rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); + rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4q_u8(outptr, rgba); +#else + uint8x16x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); + rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); + rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr, rgb); +#endif + /* Increment pointers. */ + inptr0 += 16; + inptr1 += 16; + inptr2 += 16; + outptr += (RGB_PIXELSIZE * 16); + } + + if (cols_remaining >= 8) { + uint8x8_t y = vld1_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), + consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), + consts, 3); + /* Add Y. */ + int16x8_t r = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); + int16x8_t b = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); + int16x8_t g = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); + +#ifdef RGB_ALPHA + uint8x8x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vqmovun_s16(r); + rgba.val[RGB_GREEN] = vqmovun_s16(g); + rgba.val[RGB_BLUE] = vqmovun_s16(b); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4_u8(outptr, rgba); +#else + uint8x8x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vqmovun_s16(r); + rgb.val[RGB_GREEN] = vqmovun_s16(g); + rgb.val[RGB_BLUE] = vqmovun_s16(b); + /* Store RGB pixel data to memory. */ + vst3_u8(outptr, rgb); +#endif + /* Increment pointers. */ + inptr0 += 8; + inptr1 += 8; + inptr2 += 8; + outptr += (RGB_PIXELSIZE * 8); + cols_remaining -= 8; + } + + /* Handle the tail elements. */ + if (cols_remaining > 0) { + uint8x8_t y = vld1_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), + consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), + consts, 3); + /* Add Y. */ + int16x8_t r = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); + int16x8_t b = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); + int16x8_t g = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); + +#ifdef RGB_ALPHA + uint8x8x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vqmovun_s16(r); + rgba.val[RGB_GREEN] = vqmovun_s16(g); + rgba.val[RGB_BLUE] = vqmovun_s16(b); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + switch (cols_remaining) { + case 7: + vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6); + case 6: + vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5); + case 5: + vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4); + case 4: + vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3); + case 3: + vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2); + case 2: + vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1); + case 1: + vst4_lane_u8(outptr, rgba, 0); + default: + break; + } +#else + uint8x8x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vqmovun_s16(r); + rgb.val[RGB_GREEN] = vqmovun_s16(g); + rgb.val[RGB_BLUE] = vqmovun_s16(b); + /* Store RGB pixel data to memory. */ + switch (cols_remaining) { + case 7: + vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6); + case 6: + vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5); + case 5: + vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4); + case 4: + vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3); + case 3: + vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2); + case 2: + vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1); + case 1: + vst3_lane_u8(outptr, rgb, 0); + default: + break; + } +#endif + } + } +} diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c new file mode 100644 index 00000000..abbef8ce --- /dev/null +++ b/simd/arm/jdcolor-neon.c @@ -0,0 +1,133 @@ +/* + * jdcolor-neon.c - colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include + + +/* YCbCr -> RGB conversion constants */ + +#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */ +#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */ +#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */ +#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */ + +ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = { + -F_0_344, F_0_714, F_1_402, F_1_772 +}; + + +/* Include inline routines for colorspace extensions. */ + +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon diff --git a/simd/jsimd.h b/simd/jsimd.h index 40d4721d..a2758b48 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -450,6 +450,8 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); +#ifndef NEON_INTRINSICS + EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); @@ -457,6 +459,8 @@ EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3 (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); +#endif + EXTERN(void) jsimd_ycc_rgb_convert_dspr2 (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows);