From 33859880e9eb33fa718bbe2bc7043a0a644cba9b Mon Sep 17 00:00:00 2001
From: DRC <information@libjpeg-turbo.org>
Date: Fri, 13 Nov 2020 12:12:47 -0600
Subject: [PATCH] Neon: Auto-detect compiler intrinsics completeness

This allows the Neon intrinsics code to be built successfully (albeit
likely with reduced run-time performance) with Xcode 5.0-6.2
(iOS/AArch64) and Android NDK < r19 (AArch32).  Note that Xcode 5.0-6.2
will not build the Armv8 GAS code without gas-preprocessor.pl, and no
version of Xcode will build the Armv7 GAS code without
gas-preprocessor.pl, so we always use the full Neon intrinsics
implementation by default with macOS and iOS builds.

Auto-detecting the completeness of the compiler's set of Neon intrinsics
also allows us to more intelligently set the default value of
NEON_INTRINSICS, based on the values of HAVE_VLD1*.  This is a
reasonable, albeit imperfect, proxy for whether a compiler has a full
and optimal set of Neon intrinsics.  Specific notes:

  - 64-bit RGB-to-YCbCr color conversion
    does not use any of the intrinsics in question, regresses with GCC
  - 64-bit accurate integer forward DCT
    uses vld1_s16_x3(), regresses with GCC
  - 64-bit Huffman encoding
    uses vld1q_u8_x4(), regresses with GCC
  - 64-bit YCbCr-to-RGB color conversion
    does not use any of the intrinsics in question, regresses with GCC
  - 64-bit accurate integer inverse DCT
    uses vld1_s16_x3(), regresses with GCC
  - 64-bit 4x4 inverse DCT
    uses vld1_s16_x3().  I did not test this algorithm in isolation, so
    it may in fact regress with GCC, but the regression may be hidden by
    the speedup from the new SIMD-accelerated upsampling algorithms.

  - 32-bit RGB-to-YCbCr color conversion:
    uses vld1_u16_x2(), regresses with GCC
  - 32-bit accurate integer forward DCT
    uses vld1_s16_x3(), regression irrelevant because there was no
    previous implementation
  - 32-bit accurate integer inverse DCT
    uses vld1_s16_x3(), regresses with GCC
  - 32-bit fast integer inverse DCT
    does not use any of the intrinsics in question, regresses with GCC
  - 32-bit 4x4 inverse DCT
    uses vld1_s16_x3().  I did not test this algorithm in isolation, so
    it may in fact regress with GCC, but the regression may be hidden by
    the speedup from the new SIMD-accelerated upsampling algorithms.

Presumably when GCC includes a full and optimal set of Neon intrinsics,
the HAVE_VLD1* tests will pass, and the full Neon intrinsics
implementation will be enabled automatically.
---
 BUILDING.md                      |  6 +++++-
 CMakeLists.txt                   | 28 +++++--------------------
 simd/CMakeLists.txt              | 36 ++++++++++++++++++++++++++++++++
 simd/arm/aarch32/jccolext-neon.c |  3 ++-
 simd/arm/aarch64/jchuff-neon.c   |  6 ++++--
 simd/arm/jccolor-neon.c          |  2 ++
 simd/arm/jfdctint-neon.c         |  4 +++-
 simd/arm/jidctint-neon.c         |  9 ++++----
 simd/arm/jidctred-neon.c         |  4 +++-
 simd/arm/neon-compat.h.in        | 23 ++++++++++++++++++++
 10 files changed, 88 insertions(+), 33 deletions(-)
 create mode 100644 simd/arm/neon-compat.h.in

diff --git a/BUILDING.md b/BUILDING.md
index 8a19f01d..116d0dda 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -402,7 +402,7 @@ for these platforms.
 
 ### Armv8 (64-bit)
 
-**Xcode 6.3.x or later required**
+**Xcode 5 or later required, Xcode 6.3.x or later recommended**
 
 The following script demonstrates how to build libjpeg-turbo to run on the
 iPhone 5S/iPad Mini 2/iPad Air and newer.
@@ -434,6 +434,8 @@ Building libjpeg-turbo for Android platforms requires v13b or later of the
 
 ### Armv7 (32-bit)
 
+**NDK r19 or later with Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
@@ -459,6 +461,8 @@ needs.
 
 ### Armv8 (64-bit)
 
+**Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49c4f903..e2759e46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,9 +139,9 @@ endforeach()
 
 macro(boolean_number var)
   if(${var})
-    set(${var} 1)
+    set(${var} 1 ${ARGN})
   else()
-    set(${var} 0)
+    set(${var} 0 ${ARGN})
   endif()
 endmacro()
 
@@ -548,28 +548,10 @@ if(WITH_ARITH_DEC)
 endif()
 
 if(WITH_SIMD)
-  if(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
-    # GCC doesn't yet have a full or optimal set of Neon intrinsics, so for
-    # performance reasons, when using GCC, we default to using the older GAS
-    # implementation of the Neon SIMD extensions for certain algorithms.
-    if(CMAKE_COMPILER_IS_GNUCC)
-      set(DEFAULT_NEON_INTRINSICS 0)
-    else()
-      set(DEFAULT_NEON_INTRINSICS 1)
-    endif()
-    option(NEON_INTRINSICS
-      "Because GCC doesn't yet have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with GCC is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
-      ${DEFAULT_NEON_INTRINSICS})
-    boolean_number(NEON_INTRINSICS)
-    if(NEON_INTRINSICS)
-      add_definitions(-DNEON_INTRINSICS)
-      message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
-    else()
-      message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
-    endif()
-  endif()
-
   add_subdirectory(simd)
+  if(NEON_INTRINSICS)
+    add_definitions(-DNEON_INTRINSICS)
+  endif()
 elseif(NOT WITH_12BIT)
   message(STATUS "SIMD extensions: None (WITH_SIMD = ${WITH_SIMD})")
 endif()
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 3636e6f5..f3c24ef0 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -213,6 +213,42 @@ endif()
 
 elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
 
+include(CheckSymbolExists)
+if(BITS EQUAL 32)
+  set(CMAKE_REQUIRED_FLAGS -mfpu=neon)
+endif()
+check_symbol_exists(vld1_s16_x3 arm_neon.h HAVE_VLD1_S16_X3)
+check_symbol_exists(vld1_u16_x2 arm_neon.h HAVE_VLD1_U16_X2)
+check_symbol_exists(vld1q_u8_x4 arm_neon.h HAVE_VLD1Q_U8_X4)
+if(BITS EQUAL 32)
+  unset(CMAKE_REQUIRED_FLAGS)
+endif()
+configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
+
+# GCC (as of this writing) and some older versions of Clang do not have a full
+# or optimal set of Neon intrinsics, so for performance reasons, when using
+# those compilers, we default to using the older GAS implementation of the Neon
+# SIMD extensions for certain algorithms.  The presence or absence of the three
+# intrinsics we tested above is a reasonable proxy for this.  We always default
+# to using the full Neon intrinsics implementation when building for macOS or
+# iOS, to avoid the need for gas-preprocessor.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4) OR APPLE)
+  set(DEFAULT_NEON_INTRINSICS 1)
+else()
+  set(DEFAULT_NEON_INTRINSICS 0)
+endif()
+option(NEON_INTRINSICS
+  "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
+  ${DEFAULT_NEON_INTRINSICS})
+boolean_number(NEON_INTRINSICS PARENT_SCOPE)
+if(NEON_INTRINSICS)
+  add_definitions(-DNEON_INTRINSICS)
+  message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+else()
+  message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+endif()
+
 set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
   arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
   arm/jidctred-neon.c arm/jquanti-neon.c)
diff --git a/simd/arm/aarch32/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c
index 38f90c4c..96b44d81 100644
--- a/simd/arm/aarch32/jccolext-neon.c
+++ b/simd/arm/aarch32/jccolext-neon.c
@@ -2,6 +2,7 @@
  * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -53,7 +54,7 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
   JSAMPROW outptr0, outptr1, outptr2;
 
   /* Set up conversion constants. */
-#if defined(__clang__)
+#ifdef HAVE_VLD1_U16_X2
   const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
 #else
   /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index 25ede30d..808fa956 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -2,6 +2,7 @@
  * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -32,6 +33,7 @@
 #include "../../jsimd.h"
 #include "../align.h"
 #include "../jchuff.h"
+#include "neon-compat.h"
 
 #include <limits.h>
 
@@ -65,7 +67,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
   uint16_t block_diff[DCTSIZE2];
 
   /* Load lookup table indices for rows of zig-zag ordering. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1Q_U8_X4
   const uint8x16x4_t idx_rows_0123 =
     vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
   const uint8x16x4_t idx_rows_4567 =
@@ -87,7 +89,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 #endif
 
   /* Load 8x8 block of DCT coefficients. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1Q_U8_X4
   const int8x16x4_t tbl_rows_0123 =
     vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
   const int8x16x4_t tbl_rows_4567 =
diff --git a/simd/arm/jccolor-neon.c b/simd/arm/jccolor-neon.c
index 1f8d007d..f18ed9e5 100644
--- a/simd/arm/jccolor-neon.c
+++ b/simd/arm/jccolor-neon.c
@@ -2,6 +2,7 @@
  * jccolor-neon.c - colorspace conversion (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
diff --git a/simd/arm/jfdctint-neon.c b/simd/arm/jfdctint-neon.c
index 5e891e68..ccfc07b1 100644
--- a/simd/arm/jfdctint-neon.c
+++ b/simd/arm/jfdctint-neon.c
@@ -2,6 +2,7 @@
  * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -85,7 +87,7 @@ ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
 void jsimd_fdct_islow_neon(DCTELEM *data)
 {
   /* Load DCT constants. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
 #else
   /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
diff --git a/simd/arm/jidctint-neon.c b/simd/arm/jidctint-neon.c
index cf4a9464..043b652e 100644
--- a/simd/arm/jidctint-neon.c
+++ b/simd/arm/jidctint-neon.c
@@ -30,6 +30,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -354,7 +355,7 @@ static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
                                                   int16_t *workspace_2)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -481,7 +482,7 @@ static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
                                                  int16_t *workspace_2)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -565,7 +566,7 @@ static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
                                                   unsigned buf_offset)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -712,7 +713,7 @@ static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
                                                  unsigned buf_offset)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
diff --git a/simd/arm/jidctred-neon.c b/simd/arm/jidctred-neon.c
index 023eb7de..be9627e6 100644
--- a/simd/arm/jidctred-neon.c
+++ b/simd/arm/jidctred-neon.c
@@ -2,6 +2,7 @@
  * jidctred-neon.c - reduced-size IDCT (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -221,7 +223,7 @@ void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
   int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
 
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
 #else
   /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
new file mode 100644
index 00000000..7a03d81f
--- /dev/null
+++ b/simd/arm/neon-compat.h.in
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4