Fix Neon SIMD build issues with Visual Studio
- Use the _M_ARM and _M_ARM64 macros provided by Visual Studio for
  compile-time detection of Arm builds, since __arm__ and __aarch64__ are
  only present in GNU-compatible compilers.

- Neon/intrinsics: Use the _CountLeadingZeros() and _CountLeadingZeros64()
  intrinsics provided by Visual Studio, since __builtin_clz() and
  __builtin_clzl() are only present in GNU-compatible compilers. (See the
  sketch below.)

- Neon/intrinsics: Since Visual Studio does not support static vector
  initialization, replace static initialization of Neon vectors with the
  appropriate intrinsics. Compared to the static initialization approach,
  this produces identical assembly code with both GCC and Clang.

- Neon/intrinsics: Since Visual Studio does not support inline assembly
  code, provide alternative code paths for Visual Studio whenever inline
  assembly is used.

- Build: Set FLOATTEST appropriately for AArch64 Visual Studio builds
  (Visual Studio does not emit fused multiply-add [FMA] instructions by
  default for such builds.)

- Neon/intrinsics: Move temporary buffer allocation outside of nested loops.
  Since Visual Studio configures Arm builds with a relatively small amount
  of stack memory, attempting to allocate those buffers within the inner
  loops caused a stack overflow.

Closes #461
Closes #475
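For illustration, here is a minimal standalone sketch of the count-leading-zeros shim and the intrinsic-based Neon vector initialization described above. It assumes a little-endian Arm target; the <intrin.h> include and the make_bitmap_mask() helper are illustrative additions for this sketch, not part of the patch itself:

    #include <stdint.h>
    #include <arm_neon.h>

    /* Compiler-independent count-leading-zeros macros, mirroring the
     * definitions this commit adds to neon-compat.h.  (Pulling the MSVC
     * intrinsics in via <intrin.h> is an assumption of this sketch.) */
    #if defined(_MSC_VER) && !defined(__clang__)
    #include <intrin.h>
    #define BUILTIN_CLZ(x)   _CountLeadingZeros(x)
    #define BUILTIN_CLZL(x)  _CountLeadingZeros64(x)
    #else
    #define BUILTIN_CLZ(x)   __builtin_clz(x)
    #define BUILTIN_CLZL(x)  __builtin_clzl(x)
    #endif

    /* Static Neon vector initialization (rejected by Visual Studio):
     *   const uint8x8_t bitmap_mask =
     *     { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
     * Equivalent intrinsic-based initialization used by this commit; on a
     * little-endian Arm target, lane 0 receives the least significant byte
     * of the 64-bit literal (0x80), lane 1 the next byte (0x40), and so on.
     */
    static inline uint8x8_t make_bitmap_mask(void)
    {
      return vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
    }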
@@ -892,7 +892,7 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
 endif()
 else()
 if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND
-NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+NOT CMAKE_C_COMPILER_ID STREQUAL "Clang" AND NOT MSVC)
 set(DEFAULT_FLOATTEST fp-contract)
 else()
 set(DEFAULT_FLOATTEST no-fp-contract)
@@ -135,7 +135,9 @@ default.
 for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
 been reverted.

-14. The build system can now be used to generate a universal x86-64 + Armv8
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
 libjpeg-turbo SDK package for both iOS and macOS.

jchuff.c
@@ -7,6 +7,7 @@
 * Copyright (C) 2009-2011, 2014-2016, 2018-2020, D. R. Commander.
 * Copyright (C) 2015, Matthieu Darbois.
 * Copyright (C) 2018, Matthias Räncker.
+* Copyright (C) 2020, Arm Limited.
 * For conditions of distribution and use, see the accompanying README.ijg
 * file.
 *
@@ -76,7 +77,8 @@ typedef size_t bit_buf_type;
 * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
 * retain the old Huffman encoder behavior when using the GAS implementation.
 */
-#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+defined(_M_ARM) || defined(_M_ARM64))
 typedef unsigned long long simd_bit_buf_type;
 #else
 typedef bit_buf_type simd_bit_buf_type;
@@ -477,7 +477,8 @@ jinit_upsampler(j_decompress_ptr cinfo)
 } else if (h_in_group == h_out_group &&
 v_in_group * 2 == v_out_group && do_fancy) {
 /* Non-fancy upsampling is handled by the generic method */
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || \
+defined(_M_ARM) || defined(_M_ARM64)
 if (jsimd_can_h1v2_fancy_upsample())
 upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
 else
@@ -52,6 +52,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 JSAMPROW inptr;
 /* Pointers to Y, Cb, and Cr output data */
 JSAMPROW outptr0, outptr1, outptr2;
+/* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];

 /* Set up conversion constants. */
 #ifdef HAVE_VLD1_U16_X2
@@ -79,7 +81,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 * buffer large enough to accommodate the vector load.
 */
 if (cols_remaining < 8) {
-ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
 memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
 inptr = tmp_buf;
 }
@@ -31,6 +31,7 @@
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
 #include "../jchuff.h"
+#include "neon-compat.h"

 #include <limits.h>

@@ -231,8 +232,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
 uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));

+/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
 const uint8x8_t bitmap_mask =
-{ 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));

 row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
 row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
@@ -278,7 +280,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 const unsigned int size_0xf0 = actbl->ehufsi[0xf0];

 while (bitmap_1_32 != 0) {
-r = __builtin_clz(bitmap_1_32);
+r = BUILTIN_CLZ(bitmap_1_32);
 i += r;
 bitmap_1_32 <<= r;
 nbits = block_nbits[i];
@@ -299,7 +301,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 i = 33;

 while (bitmap_33_63 != 0) {
-unsigned int leading_zeros = __builtin_clz(bitmap_33_63);
+unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
 r += leading_zeros;
 i += leading_zeros;
 bitmap_33_63 <<= leading_zeros;
@@ -51,6 +51,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 JSAMPROW inptr;
 /* Pointers to Y, Cb, and Cr output data */
 JSAMPROW outptr0, outptr1, outptr2;
+/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

 /* Set up conversion constants. */
 const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
@@ -162,7 +164,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 * (image_width % 16) columns of data are first memcopied to a temporary
 * buffer large enough to accommodate the vector load.
 */
-ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
 memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
 inptr = tmp_buf;

@@ -255,7 +256,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 * (image_width % 8) columns of data are first memcopied to a temporary
 * buffer large enough to accommodate the vector load.
 */
-ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
 memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
 inptr = tmp_buf;

@@ -205,8 +205,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
 vdupq_n_u16(0)));

+/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
 const uint8x8_t bitmap_mask =
-{ 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));

 abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
 abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
@@ -241,8 +242,12 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 /* Encode DC coefficient. */

 /* Find nbits required to specify sign and amplitude of coefficient. */
+#if defined(_MSC_VER) && !defined(__clang__)
+unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
+#else
 unsigned int lz;
 __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
+#endif
 unsigned int nbits = 32 - lz;
 /* Emit Huffman-coded symbol and additional diff bits. */
 unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
@@ -326,7 +331,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);

 while (bitmap != 0) {
-r = __builtin_clzl(bitmap);
+r = BUILTIN_CLZL(bitmap);
 i += r;
 bitmap <<= r;
 nbits = block_nbits[i];
@@ -365,10 +370,10 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,

 /* Same as above but must mask diff bits and compute nbits on demand. */
 while (bitmap != 0) {
-r = __builtin_clzl(bitmap);
+r = BUILTIN_CLZL(bitmap);
 i += r;
 bitmap <<= r;
-lz = __builtin_clz(block_abs[i]);
+lz = BUILTIN_CLZ(block_abs[i]);
 nbits = 32 - lz;
 diff = (unsigned int)(block_diff[i] << lz) >> lz;
 while (r > 15) {
@@ -53,7 +53,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {

 /* Include inline routines for colorspace extensions. */

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -68,7 +68,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE EXT_RGB_BLUE
 #define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -84,7 +84,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE EXT_RGBX_BLUE
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -100,7 +100,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE EXT_BGR_BLUE
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -116,7 +116,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE EXT_BGRX_BLUE
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -132,7 +132,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE EXT_XBGR_BLUE
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -148,7 +148,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE EXT_XRGB_BLUE
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -41,6 +41,8 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 {
 JSAMPROW inptr;
 JSAMPROW outptr;
+/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

 while (--num_rows >= 0) {
 inptr = *input_buf++;
@@ -55,7 +57,6 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 * buffer large enough to accommodate the vector load.
 */
 if (cols_remaining < 16) {
-ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
 memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
 inptr = tmp_buf;
 }
@@ -17,7 +17,7 @@
 * but must not be updated permanently until we complete the MCU.
 */

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define BIT_BUF_SIZE 64
 #else
 #define BIT_BUF_SIZE 32
@@ -54,7 +54,25 @@ typedef struct {
 * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
 * encode 0xFF as 0xFF 0x00.
 */
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)

+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+buffer[0] = (JOCTET)(put_buffer >> 56); \
+buffer[1] = (JOCTET)(put_buffer >> 48); \
+buffer[2] = (JOCTET)(put_buffer >> 40); \
+buffer[3] = (JOCTET)(put_buffer >> 32); \
+buffer[4] = (JOCTET)(put_buffer >> 24); \
+buffer[5] = (JOCTET)(put_buffer >> 16); \
+buffer[6] = (JOCTET)(put_buffer >> 8); \
+buffer[7] = (JOCTET)(put_buffer ); \
+}
+#else
+#define SPLAT() { \
+__asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
+*((uint64_t *)buffer) = put_buffer; \
+}
+#endif
+
 #define FLUSH() { \
 if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
@@ -67,14 +85,27 @@ typedef struct {
 EMIT_BYTE(put_buffer >> 8) \
 EMIT_BYTE(put_buffer ) \
 } else { \
-__asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
-*((uint64_t *)buffer) = put_buffer; \
+SPLAT() \
 buffer += 8; \
 } \
 }

 #else

+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+buffer[0] = (JOCTET)(put_buffer >> 24); \
+buffer[1] = (JOCTET)(put_buffer >> 16); \
+buffer[2] = (JOCTET)(put_buffer >> 8); \
+buffer[3] = (JOCTET)(put_buffer ); \
+}
+#else
+#define SPLAT() { \
+__asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
+*((uint32_t *)buffer) = put_buffer; \
+}
+#endif
+
 #define FLUSH() { \
 if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
 EMIT_BYTE(put_buffer >> 24) \
@@ -82,8 +113,7 @@ typedef struct {
 EMIT_BYTE(put_buffer >> 8) \
 EMIT_BYTE(put_buffer ) \
 } else { \
-__asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
-*((uint32_t *)buffer) = put_buffer; \
+SPLAT() \
 buffer += 4; \
 } \
 }
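A note on the SPLAT() macros introduced in the FLUSH() hunks above: on a little-endian Arm target, both variants store put_buffer to the output stream in big-endian byte order, one with explicit byte stores (usable by Visual Studio) and one with a byte reverse followed by a single word store (GNU-style inline assembly). Below is a hedged, asm-free model of the 64-bit case; the function names are illustrative and not taken from the patch:

    #include <stdint.h>
    #include <string.h>

    typedef unsigned char JOCTET;  /* stand-in for libjpeg's JOCTET */

    /* Visual Studio path: explicit big-endian byte stores, no inline asm. */
    static void splat_bytewise(JOCTET *buffer, uint64_t put_buffer)
    {
      buffer[0] = (JOCTET)(put_buffer >> 56);
      buffer[1] = (JOCTET)(put_buffer >> 48);
      buffer[2] = (JOCTET)(put_buffer >> 40);
      buffer[3] = (JOCTET)(put_buffer >> 32);
      buffer[4] = (JOCTET)(put_buffer >> 24);
      buffer[5] = (JOCTET)(put_buffer >> 16);
      buffer[6] = (JOCTET)(put_buffer >> 8);
      buffer[7] = (JOCTET)(put_buffer);
    }

    /* GNU-compatible path, modeled without inline asm: byte-reverse (what the
     * "rev" instruction does) and store the whole 64-bit word. */
    static void splat_byteswap(JOCTET *buffer, uint64_t put_buffer)
    {
      put_buffer = __builtin_bswap64(put_buffer);
      memcpy(buffer, &put_buffer, sizeof(put_buffer));
    }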
@@ -27,6 +27,7 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
+#include "neon-compat.h"

 #include <arm_neon.h>

@@ -212,8 +213,9 @@ void jsimd_encode_mcu_AC_first_prepare_neon
 uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
 uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));

+/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
 const uint8x8_t bitmap_mask =
-{ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

 row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
 row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
@@ -232,7 +234,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon
 uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 /* Move bitmap to a 64-bit scalar register. */
 uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 /* Store zerobits bitmap. */
@@ -456,8 +458,9 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
 uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));

+/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
 const uint8x8_t bitmap_mask =
-{ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

 abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
 abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
@@ -476,7 +479,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 /* Move bitmap to a 64-bit scalar register. */
 uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 /* Store zerobits bitmap. */
@@ -517,7 +520,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 /* Move bitmap to a 64-bit scalar register. */
 bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 /* Store signbits bitmap. */
@@ -560,7 +563,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 /* Move bitmap to a 64-bit scalar register. */
 bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

@@ -569,7 +572,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 /* EOB position is defined to be 0 if all coefficients != 1. */
 return 0;
 } else {
-return 63 - __builtin_clzl(bitmap);
+return 63 - BUILTIN_CLZL(bitmap);
 }
 #else
 /* Move bitmap to two 32-bit scalar registers. */
@@ -580,9 +583,9 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 if (bitmap0 == 0 && bitmap1 == 0) {
 return 0;
 } else if (bitmap1 != 0) {
-return 63 - __builtin_clz(bitmap1);
+return 63 - BUILTIN_CLZ(bitmap1);
 } else {
-return 31 - __builtin_clz(bitmap0);
+return 31 - BUILTIN_CLZ(bitmap0);
 }
 #endif
 }
@@ -84,7 +84,8 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 const uint8x16_t expand_mask =
 vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
 /* Load bias pattern (alternating every pixel.) */
-const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 };
+/* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
 unsigned i, outrow;

 for (outrow = 0; outrow < v_samp_factor; outrow++) {
@@ -104,7 +105,7 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,

 /* Load pixels in last DCT block into a table. */
 uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 /* Pad the empty elements with the value of the last pixel. */
 pixels = vqtbl1q_u8(pixels, expand_mask);
 #else
@@ -137,7 +138,8 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 const uint8x16_t expand_mask =
 vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
 /* Load bias pattern (alternating every pixel.) */
-const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
+/* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
 unsigned i, outrow;

 for (outrow = 0; outrow < v_samp_factor; outrow++) {
@@ -165,7 +167,7 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
 uint8x16_t pixels_r1 =
 vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 /* Pad the empty elements with the value of the last pixel. */
 pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
 pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
@@ -1,5 +1,6 @@
 /*
 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+* Copyright (C) 2020, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
@@ -21,3 +22,14 @@
 #cmakedefine HAVE_VLD1_S16_X3
 #cmakedefine HAVE_VLD1_U16_X2
 #cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZL(x) _CountLeadingZeros64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZL(x) __builtin_clzl(x)
+#else
+#error "Unknown compiler"
+#endif