Neon: Auto-detect compiler intrinsics completeness

This allows the Neon intrinsics code to be built successfully (albeit
likely with reduced run-time performance) with Xcode 5.0-6.2
(iOS/AArch64) and Android NDK < r19 (AArch32).  Note that Xcode 5.0-6.2
will not build the Armv8 GAS code without gas-preprocessor.pl, and no
version of Xcode will build the Armv7 GAS code without
gas-preprocessor.pl, so we always use the full Neon intrinsics
implementation by default with macOS and iOS builds.

Auto-detecting the completeness of the compiler's set of Neon intrinsics
also allows us to more intelligently set the default value of
NEON_INTRINSICS, based on the values of HAVE_VLD1*.  This is a
reasonable, albeit imperfect, proxy for whether a compiler has a full
and optimal set of Neon intrinsics.  Specific notes:

  - 64-bit RGB-to-YCbCr color conversion
    does not use any of the intrinsics in question, regresses with GCC
  - 64-bit accurate integer forward DCT
    uses vld1_s16_x3(), regresses with GCC
  - 64-bit Huffman encoding
    uses vld1q_u8_x4(), regresses with GCC
  - 64-bit YCbCr-to-RGB color conversion
    does not use any of the intrinsics in question, regresses with GCC
  - 64-bit accurate integer inverse DCT
    uses vld1_s16_x3(), regresses with GCC
  - 64-bit 4x4 inverse DCT
    uses vld1_s16_x3().  I did not test this algorithm in isolation, so
    it may in fact regress with GCC, but the regression may be hidden by
    the speedup from the new SIMD-accelerated upsampling algorithms.

  - 32-bit RGB-to-YCbCr color conversion:
    uses vld1_u16_x2(), regresses with GCC
  - 32-bit accurate integer forward DCT
    uses vld1_s16_x3(), regression irrelevant because there was no
    previous implementation
  - 32-bit accurate integer inverse DCT
    uses vld1_s16_x3(), regresses with GCC
  - 32-bit fast integer inverse DCT
    does not use any of the intrinsics in question, regresses with GCC
  - 32-bit 4x4 inverse DCT
    uses vld1_s16_x3().  I did not test this algorithm in isolation, so
    it may in fact regress with GCC, but the regression may be hidden by
    the speedup from the new SIMD-accelerated upsampling algorithms.

Presumably when GCC includes a full and optimal set of Neon intrinsics,
the HAVE_VLD1* tests will pass, and the full Neon intrinsics
implementation will be enabled automatically.
This commit is contained in:
DRC
2020-11-13 12:12:47 -06:00
parent bbd8089297
commit 33859880e9
10 changed files with 88 additions and 33 deletions

View File

@@ -402,7 +402,7 @@ for these platforms.
### Armv8 (64-bit)
**Xcode 6.3.x or later required**
**Xcode 5 or later required, Xcode 6.3.x or later recommended**
The following script demonstrates how to build libjpeg-turbo to run on the
iPhone 5S/iPad Mini 2/iPad Air and newer.
@@ -434,6 +434,8 @@ Building libjpeg-turbo for Android platforms requires v13b or later of the
### Armv7 (32-bit)
**NDK r19 or later with Clang recommended**
The following is a general recipe script that can be modified for your specific
needs.
@@ -459,6 +461,8 @@ needs.
### Armv8 (64-bit)
**Clang recommended**
The following is a general recipe script that can be modified for your specific
needs.

View File

@@ -139,9 +139,9 @@ endforeach()
macro(boolean_number var)
if(${var})
set(${var} 1)
set(${var} 1 ${ARGN})
else()
set(${var} 0)
set(${var} 0 ${ARGN})
endif()
endmacro()
@@ -548,28 +548,10 @@ if(WITH_ARITH_DEC)
endif()
if(WITH_SIMD)
if(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
# GCC doesn't yet have a full or optimal set of Neon intrinsics, so for
# performance reasons, when using GCC, we default to using the older GAS
# implementation of the Neon SIMD extensions for certain algorithms.
if(CMAKE_COMPILER_IS_GNUCC)
set(DEFAULT_NEON_INTRINSICS 0)
else()
set(DEFAULT_NEON_INTRINSICS 1)
endif()
option(NEON_INTRINSICS
"Because GCC doesn't yet have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with GCC is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms. Setting this option forces the full Neon intrinsics implementation to be used with all compilers. Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
${DEFAULT_NEON_INTRINSICS})
boolean_number(NEON_INTRINSICS)
if(NEON_INTRINSICS)
add_definitions(-DNEON_INTRINSICS)
message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
else()
message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
endif()
endif()
add_subdirectory(simd)
if(NEON_INTRINSICS)
add_definitions(-DNEON_INTRINSICS)
endif()
elseif(NOT WITH_12BIT)
message(STATUS "SIMD extensions: None (WITH_SIMD = ${WITH_SIMD})")
endif()

View File

@@ -213,6 +213,42 @@ endif()
elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
include(CheckSymbolExists)
if(BITS EQUAL 32)
set(CMAKE_REQUIRED_FLAGS -mfpu=neon)
endif()
check_symbol_exists(vld1_s16_x3 arm_neon.h HAVE_VLD1_S16_X3)
check_symbol_exists(vld1_u16_x2 arm_neon.h HAVE_VLD1_U16_X2)
check_symbol_exists(vld1q_u8_x4 arm_neon.h HAVE_VLD1Q_U8_X4)
if(BITS EQUAL 32)
unset(CMAKE_REQUIRED_FLAGS)
endif()
configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
# GCC (as of this writing) and some older versions of Clang do not have a full
# or optimal set of Neon intrinsics, so for performance reasons, when using
# those compilers, we default to using the older GAS implementation of the Neon
# SIMD extensions for certain algorithms. The presence or absence of the three
# intrinsics we tested above is a reasonable proxy for this. We always default
# to using the full Neon intrinsics implementation when building for macOS or
# iOS, to avoid the need for gas-preprocessor.
if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4) OR APPLE)
set(DEFAULT_NEON_INTRINSICS 1)
else()
set(DEFAULT_NEON_INTRINSICS 0)
endif()
option(NEON_INTRINSICS
"Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms. Setting this option forces the full Neon intrinsics implementation to be used with all compilers. Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
${DEFAULT_NEON_INTRINSICS})
boolean_number(NEON_INTRINSICS PARENT_SCOPE)
if(NEON_INTRINSICS)
add_definitions(-DNEON_INTRINSICS)
message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
else()
message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
endif()
set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
arm/jidctred-neon.c arm/jquanti-neon.c)

View File

@@ -2,6 +2,7 @@
* jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -53,7 +54,7 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPROW outptr0, outptr1, outptr2;
/* Set up conversion constants. */
#if defined(__clang__)
#ifdef HAVE_VLD1_U16_X2
const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x2(). */

View File

@@ -2,6 +2,7 @@
* jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -32,6 +33,7 @@
#include "../../jsimd.h"
#include "../align.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
@@ -65,7 +67,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
uint16_t block_diff[DCTSIZE2];
/* Load lookup table indices for rows of zig-zag ordering. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1Q_U8_X4
const uint8x16x4_t idx_rows_0123 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
const uint8x16x4_t idx_rows_4567 =
@@ -87,7 +89,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
#endif
/* Load 8x8 block of DCT coefficients. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1Q_U8_X4
const int8x16x4_t tbl_rows_0123 =
vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
const int8x16x4_t tbl_rows_4567 =

View File

@@ -2,6 +2,7 @@
* jccolor-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>

View File

@@ -2,6 +2,7 @@
* jfdctint-neon.c - accurate integer FDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
@@ -85,7 +87,7 @@ ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
void jsimd_fdct_islow_neon(DCTELEM *data)
{
/* Load DCT constants. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */

View File

@@ -30,6 +30,7 @@
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
@@ -354,7 +355,7 @@ static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -481,7 +482,7 @@ static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -565,7 +566,7 @@ static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -712,7 +713,7 @@ static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);

View File

@@ -2,6 +2,7 @@
* jidctred-neon.c - reduced-size IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
@@ -221,7 +223,7 @@ void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load constants for IDCT computation. */
#if defined(__clang__) || defined(_MSC_VER)
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */

23
simd/arm/neon-compat.h.in Normal file
View File

@@ -0,0 +1,23 @@
/*
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#cmakedefine HAVE_VLD1_S16_X3
#cmakedefine HAVE_VLD1_U16_X2
#cmakedefine HAVE_VLD1Q_U8_X4