Fix Neon SIMD build issues with Visual Studio

- Use the _M_ARM and _M_ARM64 macros provided by Visual Studio for compile-time detection of Arm builds, since __arm__ and __aarch64__ are only present in GNU-compatible compilers.
- Neon/intrinsics: Use the _CountLeadingZeros() and _CountLeadingZeros64() intrinsics provided by Visual Studio, since __builtin_clz() and __builtin_clzl() are only present in GNU-compatible compilers.
- Neon/intrinsics: Since Visual Studio does not support static vector initialization, replace static initialization of Neon vectors with the appropriate intrinsics. Compared to the static initialization approach, this produces identical assembly code with both GCC and Clang.
- Neon/intrinsics: Since Visual Studio does not support inline assembly code, provide alternative code paths for Visual Studio whenever inline assembly is used.
- Build: Set FLOATTEST appropriately for AArch64 Visual Studio builds (Visual Studio does not emit fused multiply-add [FMA] instructions by default for such builds.)
- Neon/intrinsics: Move temporary buffer allocation outside of nested loops. Since Visual Studio configures Arm builds with a relatively small amount of stack memory, attempting to allocate those buffers within the inner loops caused a stack overflow.

Closes #461
Closes #475
2020-11-17 12:48:49 +00:00
parent 91dd3b23ad
commit eb14189caa
14 changed files with 102 additions and 41 deletions
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -17,7 +17,7 @@
 * but must not be updated permanently until we complete the MCU.
 */

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define BIT_BUF_SIZE  64
 #else
 #define BIT_BUF_SIZE  32
@@ -54,7 +54,25 @@ typedef struct {
 * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
 * encode 0xFF as 0xFF 0x00.
 */
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 56); \
+  buffer[1] = (JOCTET)(put_buffer >> 48); \
+  buffer[2] = (JOCTET)(put_buffer >> 40); \
+  buffer[3] = (JOCTET)(put_buffer >> 32); \
+  buffer[4] = (JOCTET)(put_buffer >> 24); \
+  buffer[5] = (JOCTET)(put_buffer >> 16); \
+  buffer[6] = (JOCTET)(put_buffer >>  8); \
+  buffer[7] = (JOCTET)(put_buffer      ); \
+}
+#else
+#define SPLAT() { \
+  __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
+  *((uint64_t *)buffer) = put_buffer; \
+}
+#endif

 #define FLUSH() { \
  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
@@ -67,14 +85,27 @@ typedef struct {
    EMIT_BYTE(put_buffer >>  8) \
    EMIT_BYTE(put_buffer      ) \
  } else { \
-    __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
-    *((uint64_t *)buffer) = put_buffer; \
+    SPLAT() \
    buffer += 8; \
  } \
 }

 #else

+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+}
+#else
+#define SPLAT() { \
+  __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
+  *((uint32_t *)buffer) = put_buffer; \
+}
+#endif
+
 #define FLUSH() { \
  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
    EMIT_BYTE(put_buffer >> 24) \
@@ -82,8 +113,7 @@ typedef struct {
    EMIT_BYTE(put_buffer >>  8) \
    EMIT_BYTE(put_buffer      ) \
  } else { \
-    __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
-    *((uint32_t *)buffer) = put_buffer; \
+    SPLAT() \
    buffer += 4; \
  } \
 }