From fa2b6ea09253ce06152bc31591f6b84a3286502d Mon Sep 17 00:00:00 2001
From: DRC
Date: Fri, 12 Jan 2024 18:21:41 -0500
Subject: [PATCH] Eliminate duplicate copies of jpeg_nbits_table

ef9a4e05ba919494cbebe50e15f332de5ab97e82 (libjpeg-turbo 1.4.x), which was
based on https://bug815473.bmoattachments.org/attachment.cgi?id=692126
(https://bugzilla.mozilla.org/show_bug.cgi?id=815473), modified the C baseline
Huffman encoder so that it precomputes jpeg_nbits_table, in order to
facilitate sharing the table among multiple processes. However, libjpeg-turbo
never shared the table, and because the table was implemented as a static
array, f3a8684cd1c28e557d394470962a7a224c76ddbc (libjpeg-turbo 1.5.x) and
37bae1a0e977ee1ba769e6f0aa27e519ab6e58c6 (libjpeg-turbo 2.0.x) each introduced
a duplicate copy of the table for (respectively) the SSE2 baseline Huffman
encoder and the C progressive Huffman encoder.

This commit does the following:

- Move the duplicated code in jchuff.c and jcphuff.c, originally introduced in
  0cfc4c17b740cb2cbb11f9d85c8ab3745d5b913a and
  37bae1a0e977ee1ba769e6f0aa27e519ab6e58c6, into a header (jpeg_nbits.h).

- Credit the co-author of 0cfc4c17b740cb2cbb11f9d85c8ab3745d5b913a. (Refer to
  https://sourceforge.net/p/libjpeg-turbo/patches/57).

- Modify the SSE2 baseline Huffman encoder so that the C Huffman encoders can
  share its definition of jpeg_nbits_table.

- Move the definition of jpeg_nbits_table into a C source file (jpeg_nbits.c)
  rather than a header, and define the table only if USE_CLZ_INTRINSIC is
  undefined and the SSE2 baseline Huffman encoder will not be built.

- Apply hidden symbol visibility to the shared definition of
  jpeg_nbits_table, if the compiler supports the necessary attribute. (In
  practice, only Visual C++ doesn't.)

Closes #114 See also: https://bugzilla.mozilla.org/show_bug.cgi?id=1501523 --- CMakeLists.txt | 11 +++++++- jchuff.c | 38 ++------------------------ jconfigint.h.in | 3 +++ jcphuff.c | 37 ++----------------------- jpeg_nbits_table.h => jpeg_nbits.c | 38 +++++++++++++++++++++++++- jpeg_nbits.h | 43 ++++++++++++++++++++++++++++++ simd/i386/jchuff-sse2.asm | 11 ++++---- simd/x86_64/jchuff-sse2.asm | 11 ++++---- 8 files changed, 109 insertions(+), 83 deletions(-) rename jpeg_nbits_table.h => jpeg_nbits.c (99%) create mode 100644 jpeg_nbits.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 02709e9a..66e077ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -468,6 +468,15 @@ if(UNIX) endif() endif() +if(NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang") + check_c_source_compiles("const int __attribute__((visibility(\"hidden\"))) table[1] = { 0 }; int main(void) { return table[0]; }" + HIDDEN_WORKS) + if(HIDDEN_WORKS) + set(HIDDEN "__attribute__((visibility(\"hidden\")))") + message(STATUS "HIDDEN = ${HIDDEN}") + endif() +endif() + if(MSVC) set(INLINE_OPTIONS "__inline;inline") else() @@ -572,7 +581,7 @@ set(JPEG_SOURCES ${JPEG12_SOURCES} jcapimin.c jchuff.c jcicc.c jcinit.c jclhuff.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c jctrans.c jdapimin.c jdatadst.c jdatasrc.c jdhuff.c jdicc.c jdinput.c jdlhuff.c jdmarker.c jdmaster.c jdphuff.c jdtrans.c jerror.c jfdctflt.c jmemmgr.c - jmemnobs.c) + jmemnobs.c jpeg_nbits.c) if(WITH_ARITH_ENC OR WITH_ARITH_DEC) set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c) diff --git a/jchuff.c b/jchuff.c index 3fede05f..488c9b5c 100644 --- a/jchuff.c +++ b/jchuff.c @@ -6,7 +6,7 @@ * Lossless JPEG Modifications: * Copyright (C) 1999, Ken Murchison. * libjpeg-turbo Modifications: - * Copyright (C) 2009-2011, 2014-2016, 2018-2023, D. R. Commander. + * Copyright (C) 2009-2011, 2014-2016, 2018-2024, D. R. Commander. * Copyright (C) 2015, Matthieu Darbois. * Copyright (C) 2018, Matthias Räncker. * Copyright (C) 2020, Arm Limited. 
@@ -35,41 +35,7 @@ #include "jchuff.h" /* Declarations shared with jc*huff.c */ #endif #include - -/* - * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be - * used for bit counting rather than the lookup table. This will reduce the - * memory footprint by 64k, which is important for some mobile applications - * that create many isolated instances of libjpeg-turbo (web browsers, for - * instance.) This may improve performance on some mobile platforms as well. - * This feature is enabled by default only on Arm processors, because some x86 - * chips have a slow implementation of bsr, and the use of clz/bsr cannot be - * shown to have a significant performance impact even on the x86 chips that - * have a fast implementation of it. When building for Armv6, you can - * explicitly disable the use of clz/bsr by adding -mthumb to the compiler - * flags (this defines __thumb__). - */ - -/* NOTE: Both GCC and Clang define __GNUC__ */ -#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \ - defined(_M_ARM) || defined(_M_ARM64) -#if !defined(__thumb__) || defined(__thumb2__) -#define USE_CLZ_INTRINSIC -#endif -#endif - -#ifdef USE_CLZ_INTRINSIC -#if defined(_MSC_VER) && !defined(__clang__) -#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x)) -#else -#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) -#endif -#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) -#else -#include "jpeg_nbits_table.h" -#define JPEG_NBITS(x) (jpeg_nbits_table[x]) -#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) -#endif +#include "jpeg_nbits.h" /* Expanded entropy encoder object for Huffman encoding. diff --git a/jconfigint.h.in b/jconfigint.h.in index e7e66e74..5c14e32a 100644 --- a/jconfigint.h.in +++ b/jconfigint.h.in @@ -1,6 +1,9 @@ /* libjpeg-turbo build number */ #define BUILD "@BUILD@" +/* How to hide global symbols. 
*/ +#define HIDDEN @HIDDEN@ + /* Compiler's inline keyword */ #undef inline diff --git a/jcphuff.c b/jcphuff.c index 56e63bd6..484e2d85 100644 --- a/jcphuff.c +++ b/jcphuff.c @@ -6,7 +6,7 @@ * Lossless JPEG Modifications: * Copyright (C) 1999, Ken Murchison. * libjpeg-turbo Modifications: - * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander. + * Copyright (C) 2011, 2015, 2018, 2021-2022, 2024, D. R. Commander. * Copyright (C) 2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2020, Arm Limited. * Copyright (C) 2021, Alex Richardson. @@ -44,40 +44,7 @@ #ifdef C_PROGRESSIVE_SUPPORTED -/* - * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be - * used for bit counting rather than the lookup table. This will reduce the - * memory footprint by 64k, which is important for some mobile applications - * that create many isolated instances of libjpeg-turbo (web browsers, for - * instance.) This may improve performance on some mobile platforms as well. - * This feature is enabled by default only on Arm processors, because some x86 - * chips have a slow implementation of bsr, and the use of clz/bsr cannot be - * shown to have a significant performance impact even on the x86 chips that - * have a fast implementation of it. When building for Armv6, you can - * explicitly disable the use of clz/bsr by adding -mthumb to the compiler - * flags (this defines __thumb__). - */ - -/* NOTE: Both GCC and Clang define __GNUC__ */ -#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \ - defined(_M_ARM) || defined(_M_ARM64) -#if !defined(__thumb__) || defined(__thumb2__) -#define USE_CLZ_INTRINSIC -#endif -#endif - -#ifdef USE_CLZ_INTRINSIC -#if defined(_MSC_VER) && !defined(__clang__) -#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x)) -#else -#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) -#endif -#define JPEG_NBITS(x) (x ? 
JPEG_NBITS_NONZERO(x) : 0) -#else -#include "jpeg_nbits_table.h" -#define JPEG_NBITS(x) (jpeg_nbits_table[x]) -#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) -#endif +#include "jpeg_nbits.h" /* Expanded entropy encoder object for progressive Huffman encoding. */ diff --git a/jpeg_nbits_table.h b/jpeg_nbits.c similarity index 99% rename from jpeg_nbits_table.h rename to jpeg_nbits.c index fcf73878..c8ee6b05 100644 --- a/jpeg_nbits_table.h +++ b/jpeg_nbits.c @@ -1,4 +1,32 @@ -static const unsigned char jpeg_nbits_table[65536] = { +/* + * Copyright (C) 2024, D. R. Commander. + * + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +#include "jpeg_nbits.h" +#include "jconfigint.h" + + +#ifndef USE_CLZ_INTRINSIC + +#define INCLUDE_JPEG_NBITS_TABLE + +/* When building for x86[-64] with the SIMD extensions enabled, the C Huffman + * encoders can reuse jpeg_nbits_table from the SSE2 baseline Huffman encoder. + */ +#if (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || \ + defined(_M_X64)) && defined(WITH_SIMD) +#undef INCLUDE_JPEG_NBITS_TABLE +#endif + +#endif + + +#ifdef INCLUDE_JPEG_NBITS_TABLE + +const unsigned char HIDDEN jpeg_nbits_table[65536] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, @@ -4096,3 +4124,11 @@ static const unsigned char jpeg_nbits_table[65536] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; + +#else + +/* Suppress compiler warnings about empty translation unit. */ + +typedef int dummy_jpeg_nbits_table; + +#endif diff --git a/jpeg_nbits.h b/jpeg_nbits.h new file mode 100644 index 00000000..6481a122 --- /dev/null +++ b/jpeg_nbits.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2014, 2021, 2024, D. R. Commander. + * Copyright (C) 2014, Olle Liljenzin. + * Copyright (C) 2020, Arm Limited. 
+ * + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +/* + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be + * used for bit counting rather than the lookup table. This will reduce the + * memory footprint by 64k, which is important for some mobile applications + * that create many isolated instances of libjpeg-turbo (web browsers, for + * instance.) This may improve performance on some mobile platforms as well. + * This feature is enabled by default only on Arm processors, because some x86 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be + * shown to have a significant performance impact even on the x86 chips that + * have a fast implementation of it. When building for Armv6, you can + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler + * flags (this defines __thumb__). + */ + +/* NOTE: Both GCC and Clang define __GNUC__ */ +#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \ + defined(_M_ARM) || defined(_M_ARM64) +#if !defined(__thumb__) || defined(__thumb2__) +#define USE_CLZ_INTRINSIC +#endif +#endif + +#ifdef USE_CLZ_INTRINSIC +#if defined(_MSC_VER) && !defined(__clang__) +#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x)) +#else +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) +#endif +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) +#else +extern const unsigned char jpeg_nbits_table[65536]; +#define JPEG_NBITS(x) (jpeg_nbits_table[x]) +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) +#endif diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm index 278cf5e8..76cc85f6 100644 --- a/simd/i386/jchuff-sse2.asm +++ b/simd/i386/jchuff-sse2.asm @@ -1,7 +1,7 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (SSE2) ; -; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. 
; Copyright (C) 2018, Matthias Räncker. ; @@ -65,7 +65,8 @@ times 1 << 2 db 3 times 1 << 1 db 2 times 1 << 0 db 1 times 1 db 0 -jpeg_nbits_table: +GLOBAL_DATA(jpeg_nbits_table) +EXTN(jpeg_nbits_table): times 1 db 0 times 1 << 0 db 1 times 1 << 1 db 2 @@ -88,9 +89,9 @@ times 1 << 14 db 15 %ifdef PIC %define NBITS(x) nbits_base + x %else -%define NBITS(x) jpeg_nbits_table + x +%define NBITS(x) EXTN(jpeg_nbits_table) + x %endif -%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table) +%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table)) ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -469,7 +470,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF; %endmacro - GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER + GET_SYM nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm index a0eb9ce2..0c2cdd64 100644 --- a/simd/x86_64/jchuff-sse2.asm +++ b/simd/x86_64/jchuff-sse2.asm @@ -1,7 +1,7 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) ; -; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023-2024, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2023, Aliaksiej Kandracienka. 
@@ -67,7 +67,8 @@ times 1 << 2 db 3 times 1 << 1 db 2 times 1 << 0 db 1 times 1 db 0 -jpeg_nbits_table: +GLOBAL_DATA(jpeg_nbits_table) +EXTN(jpeg_nbits_table): times 1 db 0 times 1 << 0 db 1 times 1 << 1 db 2 @@ -89,7 +90,7 @@ times 1 << 15 db 16 alignz 32 %define NBITS(x) nbits_base + x -%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table) +%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table)) ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -290,7 +291,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): mov dctbl, POINTER [rbp+48] mov actbl, POINTER [rbp+56] punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 - lea nbits_base, [rel jpeg_nbits_table] + lea nbits_base, [rel EXTN(jpeg_nbits_table)] %else @@ -312,7 +313,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): mov buffer, rsi movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 movsx codeq, word [block] ;Z: code = block[0]; - lea nbits_base, [rel jpeg_nbits_table] + lea nbits_base, [rel EXTN(jpeg_nbits_table)] pxor xmm4, xmm4 ;A: w4[i] = 0; sub codeq, rcx ;Z: code -= last_dc_val; punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11