diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b881ccf..d057d0ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -892,7 +892,7 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") endif() else() if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND - NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") + NOT CMAKE_C_COMPILER_ID STREQUAL "Clang" AND NOT MSVC) set(DEFAULT_FLOATTEST fp-contract) else() set(DEFAULT_FLOATTEST no-fp-contract) diff --git a/ChangeLog.md b/ChangeLog.md index 0fe2ae59..d4226344 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -135,7 +135,9 @@ default. for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has been reverted. -14. The build system can now be used to generate a universal x86-64 + Armv8 +14. The Arm Neon SIMD extensions can now be built using Visual Studio. + +15. The build system can now be used to generate a universal x86-64 + Armv8 libjpeg-turbo SDK package for both iOS and macOS. diff --git a/jchuff.c b/jchuff.c index 2417cac3..8ea48b80 100644 --- a/jchuff.c +++ b/jchuff.c @@ -7,6 +7,7 @@ * Copyright (C) 2009-2011, 2014-2016, 2018-2020, D. R. Commander. * Copyright (C) 2015, Matthieu Darbois. * Copyright (C) 2018, Matthias Räncker. + * Copyright (C) 2020, Arm Limited. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -76,7 +77,8 @@ typedef size_t bit_buf_type; * intrinsics implementation of the Arm Neon SIMD extensions, which is why we * retain the old Huffman encoder behavior when using the GAS implementation. */ -#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__)) +#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \ + defined(_M_ARM) || defined(_M_ARM64)) typedef unsigned long long simd_bit_buf_type; #else typedef bit_buf_type simd_bit_buf_type; diff --git a/jdsample.c b/jdsample.c index 2d347109..eaad72a0 100644 --- a/jdsample.c +++ b/jdsample.c @@ -477,7 +477,8 @@ jinit_upsampler(j_decompress_ptr cinfo) } else if (h_in_group == h_out_group && v_in_group * 2 == v_out_group && do_fancy) { /* Non-fancy upsampling is handled by the generic method */ -#if defined(__arm__) || defined(__aarch64__) +#if defined(__arm__) || defined(__aarch64__) || \ + defined(_M_ARM) || defined(_M_ARM64) if (jsimd_can_h1v2_fancy_upsample()) upsample->methods[ci] = jsimd_h1v2_fancy_upsample; else diff --git a/simd/arm/aarch32/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c index 96b44d81..362102d2 100644 --- a/simd/arm/aarch32/jccolext-neon.c +++ b/simd/arm/aarch32/jccolext-neon.c @@ -52,6 +52,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, JSAMPROW inptr; /* Pointers to Y, Cb, and Cr output data */ JSAMPROW outptr0, outptr1, outptr2; + /* Allocate temporary buffer for final (image_width % 8) pixels in row. */ + ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE]; /* Set up conversion constants. */ #ifdef HAVE_VLD1_U16_X2 @@ -79,7 +81,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, * buffer large enough to accommodate the vector load. */ if (cols_remaining < 8) { - ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE]; memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); inptr = tmp_buf; } diff --git a/simd/arm/aarch32/jchuff-neon.c b/simd/arm/aarch32/jchuff-neon.c index 941c9b24..19d94f72 100644 --- a/simd/arm/aarch32/jchuff-neon.c +++ b/simd/arm/aarch32/jchuff-neon.c @@ -31,6 +31,7 @@ #include "../../../jsimddct.h" #include "../../jsimd.h" #include "../jchuff.h" +#include "neon-compat.h" #include @@ -231,8 +232,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0)); uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0)); + /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ const uint8x8_t bitmap_mask = - { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; + vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080)); row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask); row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask); @@ -278,7 +280,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, const unsigned int size_0xf0 = actbl->ehufsi[0xf0]; while (bitmap_1_32 != 0) { - r = __builtin_clz(bitmap_1_32); + r = BUILTIN_CLZ(bitmap_1_32); i += r; bitmap_1_32 <<= r; nbits = block_nbits[i]; @@ -299,7 +301,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, i = 33; while (bitmap_33_63 != 0) { - unsigned int leading_zeros = __builtin_clz(bitmap_33_63); + unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63); r += leading_zeros; i += leading_zeros; bitmap_33_63 <<= leading_zeros; diff --git a/simd/arm/aarch64/jccolext-neon.c b/simd/arm/aarch64/jccolext-neon.c index 756aeda8..37130c22 100644 --- a/simd/arm/aarch64/jccolext-neon.c +++ b/simd/arm/aarch64/jccolext-neon.c @@ -51,6 +51,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, JSAMPROW inptr; /* Pointers to Y, Cb, and Cr output data */ JSAMPROW outptr0, outptr1, outptr2; + /* Allocate temporary buffer for final (image_width % 16) pixels in row. */ + ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; /* Set up conversion constants. */ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts); @@ -162,7 +164,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, * (image_width % 16) columns of data are first memcopied to a temporary * buffer large enough to accommodate the vector load. */ - ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); inptr = tmp_buf; @@ -255,7 +256,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, * (image_width % 8) columns of data are first memcopied to a temporary * buffer large enough to accommodate the vector load. */ - ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE]; memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); inptr = tmp_buf; diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c index 808fa956..a0a57a66 100644 --- a/simd/arm/aarch64/jchuff-neon.c +++ b/simd/arm/aarch64/jchuff-neon.c @@ -205,8 +205,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7), vdupq_n_u16(0))); + /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ const uint8x8_t bitmap_mask = - { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; + vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080)); abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask); abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask); @@ -241,8 +242,12 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, /* Encode DC coefficient. */ /* Find nbits required to specify sign and amplitude of coefficient. */ +#if defined(_MSC_VER) && !defined(__clang__) + unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0)); +#else unsigned int lz; __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0))); +#endif unsigned int nbits = 32 - lz; /* Emit Huffman-coded symbol and additional diff bits. */ unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz; @@ -326,7 +331,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff); while (bitmap != 0) { - r = __builtin_clzl(bitmap); + r = BUILTIN_CLZL(bitmap); i += r; bitmap <<= r; nbits = block_nbits[i]; @@ -365,10 +370,10 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, /* Same as above but must mask diff bits and compute nbits on demand. */ while (bitmap != 0) { - r = __builtin_clzl(bitmap); + r = BUILTIN_CLZL(bitmap); i += r; bitmap <<= r; - lz = __builtin_clz(block_abs[i]); + lz = BUILTIN_CLZ(block_abs[i]); nbits = 32 - lz; diff = (unsigned int)(block_diff[i] << lz) >> lz; while (r > 15) { diff --git a/simd/arm/jccolor-neon.c b/simd/arm/jccolor-neon.c index f18ed9e5..9fcc62dd 100644 --- a/simd/arm/jccolor-neon.c +++ b/simd/arm/jccolor-neon.c @@ -53,7 +53,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { /* Include inline routines for colorspace extensions. */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" @@ -68,7 +68,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { #define RGB_BLUE EXT_RGB_BLUE #define RGB_PIXELSIZE EXT_RGB_PIXELSIZE #define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" @@ -84,7 +84,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { #define RGB_BLUE EXT_RGBX_BLUE #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE #define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" @@ -100,7 +100,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { #define RGB_BLUE EXT_BGR_BLUE #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE #define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" @@ -116,7 +116,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { #define RGB_BLUE EXT_BGRX_BLUE #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE #define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" @@ -132,7 +132,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { #define RGB_BLUE EXT_XBGR_BLUE #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE #define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" @@ -148,7 +148,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { #define RGB_BLUE EXT_XRGB_BLUE #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE #define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "aarch64/jccolext-neon.c" #else #include "aarch32/jccolext-neon.c" diff --git a/simd/arm/jcgryext-neon.c b/simd/arm/jcgryext-neon.c index b1f00e60..416a7385 100644 --- a/simd/arm/jcgryext-neon.c +++ b/simd/arm/jcgryext-neon.c @@ -41,6 +41,8 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, { JSAMPROW inptr; JSAMPROW outptr; + /* Allocate temporary buffer for final (image_width % 16) pixels in row. */ + ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; while (--num_rows >= 0) { inptr = *input_buf++; @@ -55,7 +57,6 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, * buffer large enough to accommodate the vector load. */ if (cols_remaining < 16) { - ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); inptr = tmp_buf; } diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h index 87ff0d37..d30759f5 100644 --- a/simd/arm/jchuff.h +++ b/simd/arm/jchuff.h @@ -17,7 +17,7 @@ * but must not be updated permanently until we complete the MCU. */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #define BIT_BUF_SIZE 64 #else #define BIT_BUF_SIZE 32 @@ -54,7 +54,25 @@ typedef struct { * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to * encode 0xFF as 0xFF 0x00. */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) + +#if defined(_MSC_VER) && !defined(__clang__) +#define SPLAT() { \ + buffer[0] = (JOCTET)(put_buffer >> 56); \ + buffer[1] = (JOCTET)(put_buffer >> 48); \ + buffer[2] = (JOCTET)(put_buffer >> 40); \ + buffer[3] = (JOCTET)(put_buffer >> 32); \ + buffer[4] = (JOCTET)(put_buffer >> 24); \ + buffer[5] = (JOCTET)(put_buffer >> 16); \ + buffer[6] = (JOCTET)(put_buffer >> 8); \ + buffer[7] = (JOCTET)(put_buffer ); \ +} +#else +#define SPLAT() { \ + __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \ + *((uint64_t *)buffer) = put_buffer; \ +} +#endif #define FLUSH() { \ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ @@ -67,14 +85,27 @@ typedef struct { EMIT_BYTE(put_buffer >> 8) \ EMIT_BYTE(put_buffer ) \ } else { \ - __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \ - *((uint64_t *)buffer) = put_buffer; \ + SPLAT() \ buffer += 8; \ } \ } #else +#if defined(_MSC_VER) && !defined(__clang__) +#define SPLAT() { \ + buffer[0] = (JOCTET)(put_buffer >> 24); \ + buffer[1] = (JOCTET)(put_buffer >> 16); \ + buffer[2] = (JOCTET)(put_buffer >> 8); \ + buffer[3] = (JOCTET)(put_buffer ); \ +} +#else +#define SPLAT() { \ + __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \ + *((uint32_t *)buffer) = put_buffer; \ +} +#endif + #define FLUSH() { \ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \ EMIT_BYTE(put_buffer >> 24) \ @@ -82,8 +113,7 @@ typedef struct { EMIT_BYTE(put_buffer >> 8) \ EMIT_BYTE(put_buffer ) \ } else { \ - __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \ - *((uint32_t *)buffer) = put_buffer; \ + SPLAT() \ buffer += 4; \ } \ } diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c index 61f94c2e..8b6d53be 100644 --- a/simd/arm/jcphuff-neon.c +++ b/simd/arm/jcphuff-neon.c @@ -27,6 +27,7 @@ #include "../../jdct.h" #include "../../jsimddct.h" #include "../jsimd.h" +#include "neon-compat.h" #include @@ -212,8 +213,9 @@ void jsimd_encode_mcu_AC_first_prepare_neon uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0))); uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0))); + /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = - { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }; + vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201)); row0_eq0 = vand_u8(row0_eq0, bitmap_mask); row1_eq0 = vand_u8(row1_eq0, bitmap_mask); @@ -232,7 +234,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* Move bitmap to a 64-bit scalar register. */ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); /* Store zerobits bitmap. */ @@ -456,8 +458,9 @@ int jsimd_encode_mcu_AC_refine_prepare_neon uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0))); uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0))); + /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = - { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }; + vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201)); abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask); abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask); @@ -476,7 +479,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* Move bitmap to a 64-bit scalar register. */ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); /* Store zerobits bitmap. */ @@ -517,7 +520,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* Move bitmap to a 64-bit scalar register. */ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); /* Store signbits bitmap. */ @@ -560,7 +563,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* Move bitmap to a 64-bit scalar register. */ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); @@ -569,7 +572,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon /* EOB position is defined to be 0 if all coefficients != 1. */ return 0; } else { - return 63 - __builtin_clzl(bitmap); + return 63 - BUILTIN_CLZL(bitmap); } #else /* Move bitmap to two 32-bit scalar registers. */ @@ -580,9 +583,9 @@ int jsimd_encode_mcu_AC_refine_prepare_neon if (bitmap0 == 0 && bitmap1 == 0) { return 0; } else if (bitmap1 != 0) { - return 63 - __builtin_clz(bitmap1); + return 63 - BUILTIN_CLZ(bitmap1); } else { - return 31 - __builtin_clz(bitmap0); + return 31 - BUILTIN_CLZ(bitmap0); } #endif } diff --git a/simd/arm/jcsample-neon.c b/simd/arm/jcsample-neon.c index e4e7827a..8a3e2378 100644 --- a/simd/arm/jcsample-neon.c +++ b/simd/arm/jcsample-neon.c @@ -84,7 +84,8 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, const uint8x16_t expand_mask = vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]); /* Load bias pattern (alternating every pixel.) */ - const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 }; + /* { 0, 1, 0, 1, 0, 1, 0, 1 } */ + const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000)); unsigned i, outrow; for (outrow = 0; outrow < v_samp_factor; outrow++) { @@ -104,7 +105,7 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, /* Load pixels in last DCT block into a table. */ uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* Pad the empty elements with the value of the last pixel. */ pixels = vqtbl1q_u8(pixels, expand_mask); #else @@ -137,7 +138,8 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, const uint8x16_t expand_mask = vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]); /* Load bias pattern (alternating every pixel.) */ - const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 }; + /* { 1, 2, 1, 2, 1, 2, 1, 2 } */ + const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001)); unsigned i, outrow; for (outrow = 0; outrow < v_samp_factor; outrow++) { @@ -165,7 +167,7 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE); uint8x16_t pixels_r1 = vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* Pad the empty elements with the value of the last pixel. */ pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask); pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask); diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in index 7a03d81f..e2347b9b 100644 --- a/simd/arm/neon-compat.h.in +++ b/simd/arm/neon-compat.h.in @@ -1,5 +1,6 @@ /* * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, Arm Limited. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,3 +22,14 @@ #cmakedefine HAVE_VLD1_S16_X3 #cmakedefine HAVE_VLD1_U16_X2 #cmakedefine HAVE_VLD1Q_U8_X4 + +/* Define compiler-independent count-leading-zeros macros */ +#if defined(_MSC_VER) && !defined(__clang__) +#define BUILTIN_CLZ(x) _CountLeadingZeros(x) +#define BUILTIN_CLZL(x) _CountLeadingZeros64(x) +#elif defined(__clang__) || defined(__GNUC__) +#define BUILTIN_CLZ(x) __builtin_clz(x) +#define BUILTIN_CLZL(x) __builtin_clzl(x) +#else +#error "Unknown compiler" +#endif