diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b881ccf..d057d0ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -892,7 +892,7 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
   endif()
 else()
   if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND
-    NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    NOT CMAKE_C_COMPILER_ID STREQUAL "Clang" AND NOT MSVC)
     set(DEFAULT_FLOATTEST fp-contract)
   else()
     set(DEFAULT_FLOATTEST no-fp-contract)
diff --git a/ChangeLog.md b/ChangeLog.md
index 0fe2ae59..d4226344 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -135,7 +135,9 @@ default.
 for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
 been reverted.
 
-14. The build system can now be used to generate a universal x86-64 + Armv8
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
 libjpeg-turbo SDK package for both iOS and macOS.
 
 
diff --git a/jchuff.c b/jchuff.c
index 2417cac3..8ea48b80 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -7,6 +7,7 @@
  * Copyright (C) 2009-2011, 2014-2016, 2018-2020, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -76,7 +77,8 @@ typedef size_t bit_buf_type;
  * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
  * retain the old Huffman encoder behavior when using the GAS implementation.
  */
-#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+                            defined(_M_ARM) || defined(_M_ARM64))
 typedef unsigned long long simd_bit_buf_type;
 #else
 typedef bit_buf_type simd_bit_buf_type;
diff --git a/jdsample.c b/jdsample.c
index 2d347109..eaad72a0 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -477,7 +477,8 @@ jinit_upsampler(j_decompress_ptr cinfo)
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || \
+    defined(_M_ARM) || defined(_M_ARM64)
       if (jsimd_can_h1v2_fancy_upsample())
         upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
       else
diff --git a/simd/arm/aarch32/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c
index 96b44d81..362102d2 100644
--- a/simd/arm/aarch32/jccolext-neon.c
+++ b/simd/arm/aarch32/jccolext-neon.c
@@ -52,6 +52,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
   JSAMPROW inptr;
   /* Pointers to Y, Cb, and Cr output data */
   JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
 
   /* Set up conversion constants. */
 #ifdef HAVE_VLD1_U16_X2
@@ -79,7 +81,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
        * buffer large enough to accommodate the vector load.
        */
       if (cols_remaining < 8) {
-        ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
         memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
         inptr = tmp_buf;
       }
diff --git a/simd/arm/aarch32/jchuff-neon.c b/simd/arm/aarch32/jchuff-neon.c
index 941c9b24..19d94f72 100644
--- a/simd/arm/aarch32/jchuff-neon.c
+++ b/simd/arm/aarch32/jchuff-neon.c
@@ -31,6 +31,7 @@
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
 #include "../jchuff.h"
+#include "neon-compat.h"
 
 #include <limits.h>
 
@@ -231,8 +232,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
   uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
   uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
 
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
   const uint8x8_t bitmap_mask =
-    { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
 
   row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
   row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
@@ -278,7 +280,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
   const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
 
   while (bitmap_1_32 != 0) {
-    r = __builtin_clz(bitmap_1_32);
+    r = BUILTIN_CLZ(bitmap_1_32);
     i += r;
     bitmap_1_32 <<= r;
     nbits = block_nbits[i];
@@ -299,7 +301,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
   i = 33;
 
   while (bitmap_33_63 != 0) {
-    unsigned int leading_zeros = __builtin_clz(bitmap_33_63);
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
     r += leading_zeros;
     i += leading_zeros;
     bitmap_33_63 <<= leading_zeros;
diff --git a/simd/arm/aarch64/jccolext-neon.c b/simd/arm/aarch64/jccolext-neon.c
index 756aeda8..37130c22 100644
--- a/simd/arm/aarch64/jccolext-neon.c
+++ b/simd/arm/aarch64/jccolext-neon.c
@@ -51,6 +51,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
   JSAMPROW inptr;
   /* Pointers to Y, Cb, and Cr output data */
   JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
 
   /* Set up conversion constants. */
   const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
@@ -162,7 +164,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
        * (image_width % 16) columns of data are first memcopied to a temporary
        * buffer large enough to accommodate the vector load.
        */
-      ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
       memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
       inptr = tmp_buf;
 
@@ -255,7 +256,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
        * (image_width % 8) columns of data are first memcopied to a temporary
        * buffer large enough to accommodate the vector load.
        */
-      ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
       memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
       inptr = tmp_buf;
 
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index 808fa956..a0a57a66 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -205,8 +205,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
   uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
                                                vdupq_n_u16(0)));
 
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
   const uint8x8_t bitmap_mask =
-    { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
 
   abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
   abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
@@ -241,8 +242,12 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
   /* Encode DC coefficient. */
 
   /* Find nbits required to specify sign and amplitude of coefficient. */
+#if defined(_MSC_VER) && !defined(__clang__)
+  unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
+#else
   unsigned int lz;
   __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
+#endif
   unsigned int nbits = 32 - lz;
   /* Emit Huffman-coded symbol and additional diff bits. */
   unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
@@ -326,7 +331,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
     vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
 
     while (bitmap != 0) {
-      r = __builtin_clzl(bitmap);
+      r = BUILTIN_CLZL(bitmap);
       i += r;
       bitmap <<= r;
       nbits = block_nbits[i];
@@ -365,10 +370,10 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 
     /* Same as above but must mask diff bits and compute nbits on demand. */
     while (bitmap != 0) {
-      r = __builtin_clzl(bitmap);
+      r = BUILTIN_CLZL(bitmap);
       i += r;
       bitmap <<= r;
-      lz = __builtin_clz(block_abs[i]);
+      lz = BUILTIN_CLZ(block_abs[i]);
       nbits = 32 - lz;
       diff = (unsigned int)(block_diff[i] << lz) >> lz;
       while (r > 15) {
diff --git a/simd/arm/jccolor-neon.c b/simd/arm/jccolor-neon.c
index f18ed9e5..9fcc62dd 100644
--- a/simd/arm/jccolor-neon.c
+++ b/simd/arm/jccolor-neon.c
@@ -53,7 +53,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 
 /* Include inline routines for colorspace extensions. */
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -68,7 +68,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE  EXT_RGB_BLUE
 #define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -84,7 +84,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE  EXT_RGBX_BLUE
 #define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -100,7 +100,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE  EXT_BGR_BLUE
 #define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -116,7 +116,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE  EXT_BGRX_BLUE
 #define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -132,7 +132,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE  EXT_XBGR_BLUE
 #define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -148,7 +148,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
 #define RGB_BLUE  EXT_XRGB_BLUE
 #define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
diff --git a/simd/arm/jcgryext-neon.c b/simd/arm/jcgryext-neon.c
index b1f00e60..416a7385 100644
--- a/simd/arm/jcgryext-neon.c
+++ b/simd/arm/jcgryext-neon.c
@@ -41,6 +41,8 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
 {
   JSAMPROW inptr;
   JSAMPROW outptr;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
@@ -55,7 +57,6 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
        * buffer large enough to accommodate the vector load.
        */
       if (cols_remaining < 16) {
-        ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
         memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
         inptr = tmp_buf;
       }
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
index 87ff0d37..d30759f5 100644
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -17,7 +17,7 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define BIT_BUF_SIZE  64
 #else
 #define BIT_BUF_SIZE  32
@@ -54,7 +54,25 @@ typedef struct {
  * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
  * encode 0xFF as 0xFF 0x00.
  */
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 56); \
+  buffer[1] = (JOCTET)(put_buffer >> 48); \
+  buffer[2] = (JOCTET)(put_buffer >> 40); \
+  buffer[3] = (JOCTET)(put_buffer >> 32); \
+  buffer[4] = (JOCTET)(put_buffer >> 24); \
+  buffer[5] = (JOCTET)(put_buffer >> 16); \
+  buffer[6] = (JOCTET)(put_buffer >>  8); \
+  buffer[7] = (JOCTET)(put_buffer      ); \
+}
+#else
+#define SPLAT() { \
+  __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
+  *((uint64_t *)buffer) = put_buffer; \
+}
+#endif
 
 #define FLUSH() { \
   if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
@@ -67,14 +85,27 @@ typedef struct {
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
-    *((uint64_t *)buffer) = put_buffer; \
+    SPLAT() \
     buffer += 8; \
   } \
 }
 
 #else
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+}
+#else
+#define SPLAT() { \
+  __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
+  *((uint32_t *)buffer) = put_buffer; \
+}
+#endif
+
 #define FLUSH() { \
   if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
     EMIT_BYTE(put_buffer >> 24) \
@@ -82,8 +113,7 @@ typedef struct {
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
-    *((uint32_t *)buffer) = put_buffer; \
+    SPLAT() \
     buffer += 4; \
   } \
 }
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index 61f94c2e..8b6d53be 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -27,6 +27,7 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -212,8 +213,9 @@ void jsimd_encode_mcu_AC_first_prepare_neon
   uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
   uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
 
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
-    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 
   row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
   row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
@@ -232,7 +234,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon
   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
   /* Store zerobits bitmap. */
@@ -456,8 +458,9 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
   uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
 
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
-    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 
   abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
   abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
@@ -476,7 +479,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
   /* Store zerobits bitmap. */
@@ -517,7 +520,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
   /* Store signbits bitmap. */
@@ -560,7 +563,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 
@@ -569,7 +572,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     /* EOB position is defined to be 0 if all coefficients != 1. */
     return 0;
   } else {
-    return 63 - __builtin_clzl(bitmap);
+    return 63 - BUILTIN_CLZL(bitmap);
   }
 #else
   /* Move bitmap to two 32-bit scalar registers. */
@@ -580,9 +583,9 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   if (bitmap0 == 0 && bitmap1 == 0) {
     return 0;
   } else if (bitmap1 != 0) {
-    return 63 - __builtin_clz(bitmap1);
+    return 63 - BUILTIN_CLZ(bitmap1);
   } else {
-    return 31 - __builtin_clz(bitmap0);
+    return 31 - BUILTIN_CLZ(bitmap0);
   }
 #endif
 }
diff --git a/simd/arm/jcsample-neon.c b/simd/arm/jcsample-neon.c
index e4e7827a..8a3e2378 100644
--- a/simd/arm/jcsample-neon.c
+++ b/simd/arm/jcsample-neon.c
@@ -84,7 +84,8 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
   const uint8x16_t expand_mask =
     vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
   /* Load bias pattern (alternating every pixel.) */
-  const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 };
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
   unsigned i, outrow;
 
   for (outrow = 0; outrow < v_samp_factor; outrow++) {
@@ -104,7 +105,7 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 
     /* Load pixels in last DCT block into a table. */
     uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     /* Pad the empty elements with the value of the last pixel. */
     pixels = vqtbl1q_u8(pixels, expand_mask);
 #else
@@ -137,7 +138,8 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
   const uint8x16_t expand_mask =
     vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
   /* Load bias pattern (alternating every pixel.) */
-  const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
   unsigned i, outrow;
 
   for (outrow = 0; outrow < v_samp_factor; outrow++) {
@@ -165,7 +167,7 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
       vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
     uint8x16_t pixels_r1 =
       vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     /* Pad the empty elements with the value of the last pixel. */
     pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
     pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index 7a03d81f..e2347b9b 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -21,3 +22,14 @@
 #cmakedefine HAVE_VLD1_S16_X3
 #cmakedefine HAVE_VLD1_U16_X2
 #cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZL(x)  _CountLeadingZeros64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZL(x)  __builtin_clzl(x)
+#else
+#error "Unknown compiler"
+#endif