Eliminate duplicate copies of jpeg_nbits_table

ef9a4e05ba (libjpeg-turbo 1.4.x), which
was based on
https://bug815473.bmoattachments.org/attachment.cgi?id=692126
(https://bugzilla.mozilla.org/show_bug.cgi?id=815473), modified the C
baseline Huffman encoder so that it precomputes jpeg_nbits_table, in
order to facilitate sharing the table among multiple processes.
However, libjpeg-turbo never shared the table, and because the table was
implemented as a static array, f3a8684cd1
(libjpeg-turbo 1.5.x) and 37bae1a0e9
(libjpeg-turbo 2.0.x) each introduced a duplicate copy of the table for
(respectively) the SSE2 baseline Huffman encoder and the C progressive
Huffman encoder.

This commit does the following:
- Move the duplicated code in jchuff.c and jcphuff.c, originally
  introduced in 0cfc4c17b7 and
  37bae1a0e9, into a header
  (jpeg_nbits.h).
- Credit the co-author of 0cfc4c17b7.
  (Refer to https://sourceforge.net/p/libjpeg-turbo/patches/57).
- Modify the SSE2 baseline Huffman encoder so that the C Huffman
  encoders can share its definition of jpeg_nbits_table.
- Move the definition of jpeg_nbits_table into a C source file
  (jpeg_nbits.c) rather than a header, and define the table only if
  USE_CLZ_INTRINSIC is undefined and the SSE2 baseline Huffman encoder
  will not be built.
- Apply hidden symbol visibility to the shared definition of
  jpeg_nbits_table, if the compiler supports the necessary attribute.
  (In practice, only Visual C++ doesn't.)

Closes #114

See also:
https://bugzilla.mozilla.org/show_bug.cgi?id=1501523
This commit is contained in:
DRC
2024-01-12 18:21:41 -05:00
parent be96fa0a40
commit fa2b6ea092
8 changed files with 109 additions and 83 deletions

View File

@@ -468,6 +468,15 @@ if(UNIX)
endif()
endif()
# Detect whether the C compiler accepts the GNU-style
# __attribute__((visibility("hidden"))) on a global const array.
#
# The probe is skipped entirely when MSVC is set, unless the compiler ID is
# Clang.  check_c_source_compiles() stores its result in HIDDEN_WORKS.  When
# the probe compiles, HIDDEN is set to the attribute text and echoed in the
# configure output; otherwise this block leaves HIDDEN unset, so an @HIDDEN@
# placeholder in a configured file expands to nothing.
if(NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
# The test program both defines a hidden table and reads it from main(), so
# the attribute is exercised on a definition that is actually referenced.
check_c_source_compiles("const int __attribute__((visibility(\"hidden\"))) table[1] = { 0 }; int main(void) { return table[0]; }"
HIDDEN_WORKS)
if(HIDDEN_WORKS)
set(HIDDEN "__attribute__((visibility(\"hidden\")))")
message(STATUS "HIDDEN = ${HIDDEN}")
endif()
endif()
if(MSVC)
set(INLINE_OPTIONS "__inline;inline")
else()
@@ -572,7 +581,7 @@ set(JPEG_SOURCES ${JPEG12_SOURCES} jcapimin.c jchuff.c jcicc.c jcinit.c
jclhuff.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c jctrans.c
jdapimin.c jdatadst.c jdatasrc.c jdhuff.c jdicc.c jdinput.c jdlhuff.c
jdmarker.c jdmaster.c jdphuff.c jdtrans.c jerror.c jfdctflt.c jmemmgr.c
jmemnobs.c)
jmemnobs.c jpeg_nbits.c)
if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)

View File

@@ -6,7 +6,7 @@
* Lossless JPEG Modifications:
* Copyright (C) 1999, Ken Murchison.
* libjpeg-turbo Modifications:
* Copyright (C) 2009-2011, 2014-2016, 2018-2023, D. R. Commander.
* Copyright (C) 2009-2011, 2014-2016, 2018-2024, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2018, Matthias Räncker.
* Copyright (C) 2020, Arm Limited.
@@ -35,41 +35,7 @@
#include "jchuff.h" /* Declarations shared with jc*huff.c */
#endif
#include <limits.h>
/*
* NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
* used for bit counting rather than the lookup table. This will reduce the
* memory footprint by 64k, which is important for some mobile applications
* that create many isolated instances of libjpeg-turbo (web browsers, for
* instance.) This may improve performance on some mobile platforms as well.
* This feature is enabled by default only on Arm processors, because some x86
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
* shown to have a significant performance impact even on the x86 chips that
* have a fast implementation of it. When building for Armv6, you can
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
* flags (this defines __thumb__).
*/
/* NOTE: Both GCC and Clang define __GNUC__ */
#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
defined(_M_ARM) || defined(_M_ARM64)
#if !defined(__thumb__) || defined(__thumb2__)
#define USE_CLZ_INTRINSIC
#endif
#endif
#ifdef USE_CLZ_INTRINSIC
#if defined(_MSC_VER) && !defined(__clang__)
#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
#else
#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
#endif
#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
#else
#include "jpeg_nbits_table.h"
#define JPEG_NBITS(x) (jpeg_nbits_table[x])
#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
#endif
#include "jpeg_nbits.h"
/* Expanded entropy encoder object for Huffman encoding.

View File

@@ -1,6 +1,9 @@
/* libjpeg-turbo build number */
#define BUILD "@BUILD@"
/* How to hide global symbols. */
#define HIDDEN @HIDDEN@
/* Compiler's inline keyword */
#undef inline

View File

@@ -6,7 +6,7 @@
* Lossless JPEG Modifications:
* Copyright (C) 1999, Ken Murchison.
* libjpeg-turbo Modifications:
* Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
* Copyright (C) 2011, 2015, 2018, 2021-2022, 2024, D. R. Commander.
* Copyright (C) 2016, 2018, 2022, Matthieu Darbois.
* Copyright (C) 2020, Arm Limited.
* Copyright (C) 2021, Alex Richardson.
@@ -44,40 +44,7 @@
#ifdef C_PROGRESSIVE_SUPPORTED
/*
* NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
* used for bit counting rather than the lookup table. This will reduce the
* memory footprint by 64k, which is important for some mobile applications
* that create many isolated instances of libjpeg-turbo (web browsers, for
* instance.) This may improve performance on some mobile platforms as well.
* This feature is enabled by default only on Arm processors, because some x86
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
* shown to have a significant performance impact even on the x86 chips that
* have a fast implementation of it. When building for Armv6, you can
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
* flags (this defines __thumb__).
*/
/* NOTE: Both GCC and Clang define __GNUC__ */
#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
defined(_M_ARM) || defined(_M_ARM64)
#if !defined(__thumb__) || defined(__thumb2__)
#define USE_CLZ_INTRINSIC
#endif
#endif
#ifdef USE_CLZ_INTRINSIC
#if defined(_MSC_VER) && !defined(__clang__)
#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
#else
#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
#endif
#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
#else
#include "jpeg_nbits_table.h"
#define JPEG_NBITS(x) (jpeg_nbits_table[x])
#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
#endif
#include "jpeg_nbits.h"
/* Expanded entropy encoder object for progressive Huffman encoding. */

View File

@@ -1,4 +1,32 @@
static const unsigned char jpeg_nbits_table[65536] = {
/*
* Copyright (C) 2024, D. R. Commander.
*
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
#include "jpeg_nbits.h"
#include "jconfigint.h"
#ifndef USE_CLZ_INTRINSIC
#define INCLUDE_JPEG_NBITS_TABLE
/* When building for x86[-64] with the SIMD extensions enabled, the C Huffman
* encoders can reuse jpeg_nbits_table from the SSE2 baseline Huffman encoder.
*/
#if (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || \
defined(_M_X64)) && defined(WITH_SIMD)
#undef INCLUDE_JPEG_NBITS_TABLE
#endif
#endif
#ifdef INCLUDE_JPEG_NBITS_TABLE
const unsigned char HIDDEN jpeg_nbits_table[65536] = {
0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
@@ -4096,3 +4124,11 @@ static const unsigned char jpeg_nbits_table[65536] = {
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
};
#else
/* Suppress compiler warnings about empty translation unit. */
typedef int dummy_jpeg_nbits_table;
#endif

43
jpeg_nbits.h Normal file
View File

@@ -0,0 +1,43 @@
/*
* Copyright (C) 2014, 2021, 2024, D. R. Commander.
* Copyright (C) 2014, Olle Liljenzin.
* Copyright (C) 2020, Arm Limited.
*
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
/*
* NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
* used for bit counting rather than the lookup table. This will reduce the
* memory footprint by 64k, which is important for some mobile applications
* that create many isolated instances of libjpeg-turbo (web browsers, for
* instance.) This may improve performance on some mobile platforms as well.
* This feature is enabled by default only on Arm processors, because some x86
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
* shown to have a significant performance impact even on the x86 chips that
* have a fast implementation of it. When building for Armv6, you can
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
* flags (this defines __thumb__).
*/
/* NOTE: Both GCC and Clang define __GNUC__ */
/* Define USE_CLZ_INTRINSIC by default when targeting Arm (__arm__ or
 * __aarch64__ with a __GNUC__-compatible compiler, or _M_ARM/_M_ARM64),
 * except when __thumb__ is defined without __thumb2__.  Because the inner
 * test only ever defines the macro, a USE_CLZ_INTRINSIC supplied externally
 * (e.g. on the compiler command line) is honored on any target.
 */
#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
defined(_M_ARM) || defined(_M_ARM64)
#if !defined(__thumb__) || defined(__thumb2__)
#define USE_CLZ_INTRINSIC
#endif
#endif
/*
 * Bit-counting macros shared by the Huffman encoders:
 *
 *   JPEG_NBITS(x)          yields 0 for x == 0; otherwise the result of
 *                          JPEG_NBITS_NONZERO(x).
 *   JPEG_NBITS_NONZERO(x)  same computation without the zero test.
 *
 * In both variants below, x is substituted without surrounding parentheses,
 * and the intrinsic variant of JPEG_NBITS() expands x twice, so callers
 * should pass a simple expression with no side effects.
 */
#ifdef USE_CLZ_INTRINSIC
/* Intrinsic variant: 32 minus the leading-zero count of x.  The zero case is
 * handled only by JPEG_NBITS(); JPEG_NBITS_NONZERO() hands x straight to the
 * intrinsic (GCC documents __builtin_clz(0) as undefined), so x must be
 * nonzero there.  The constant 32 presumably reflects a 32-bit operand --
 * NOTE(review): confirm for any target where unsigned int is not 32 bits.
 */
#if defined(_MSC_VER) && !defined(__clang__)
#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
#else
#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
#endif
#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
#else
/* Lookup-table variant: both macros index a 65536-entry byte table, so x
 * must lie in the range 0..65535.  The table is only declared here; its
 * definition must be provided by another object file at link time.
 */
extern const unsigned char jpeg_nbits_table[65536];
#define JPEG_NBITS(x) (jpeg_nbits_table[x])
#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
#endif

View File

@@ -1,7 +1,7 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
;
; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
@@ -65,7 +65,8 @@ times 1 << 2 db 3
times 1 << 1 db 2
times 1 << 0 db 1
times 1 db 0
jpeg_nbits_table:
GLOBAL_DATA(jpeg_nbits_table)
EXTN(jpeg_nbits_table):
times 1 db 0
times 1 << 0 db 1
times 1 << 1 db 2
@@ -88,9 +89,9 @@ times 1 << 14 db 15
%ifdef PIC
%define NBITS(x) nbits_base + x
%else
%define NBITS(x) jpeg_nbits_table + x
%define NBITS(x) EXTN(jpeg_nbits_table) + x
%endif
%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
; --------------------------------------------------------------------------
SECTION SEG_TEXT
@@ -469,7 +470,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
%endmacro
GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
GET_SYM nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER
psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59

View File

@@ -1,7 +1,7 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023, D. R. Commander.
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023-2024, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -67,7 +67,8 @@ times 1 << 2 db 3
times 1 << 1 db 2
times 1 << 0 db 1
times 1 db 0
jpeg_nbits_table:
GLOBAL_DATA(jpeg_nbits_table)
EXTN(jpeg_nbits_table):
times 1 db 0
times 1 << 0 db 1
times 1 << 1 db 2
@@ -89,7 +90,7 @@ times 1 << 15 db 16
alignz 32
%define NBITS(x) nbits_base + x
%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
; --------------------------------------------------------------------------
SECTION SEG_TEXT
@@ -290,7 +291,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov dctbl, POINTER [rbp+48]
mov actbl, POINTER [rbp+56]
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
lea nbits_base, [rel jpeg_nbits_table]
lea nbits_base, [rel EXTN(jpeg_nbits_table)]
%else
@@ -312,7 +313,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov buffer, rsi
movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
movsx codeq, word [block] ;Z: code = block[0];
lea nbits_base, [rel jpeg_nbits_table]
lea nbits_base, [rel EXTN(jpeg_nbits_table)]
pxor xmm4, xmm4 ;A: w4[i] = 0;
sub codeq, rcx ;Z: code -= last_dc_val;
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11