- Use the _M_ARM and _M_ARM64 macros provided by Visual Studio for compile-time detection of Arm builds, since __arm__ and __aarch64__ are only present in GNU-compatible compilers. - Neon/intrinsics: Use the _CountLeadingZeros() and _CountLeadingZeros64() intrinsics provided by Visual Studio, since __builtin_clz() and __builtin_clzl() are only present in GNU-compatible compilers. - Neon/intrinsics: Since Visual Studio does not support static vector initialization, replace static initialization of Neon vectors with the appropriate intrinsics. Compared to the static initialization approach, this produces identical assembly code with both GCC and Clang. - Neon/intrinsics: Since Visual Studio does not support inline assembly code, provide alternative code paths for Visual Studio whenever inline assembly is used. - Build: Set FLOATTEST appropriately for AArch64 Visual Studio builds (Visual Studio does not emit fused multiply-add [FMA] instructions by default for such builds.) - Neon/intrinsics: Move temporary buffer allocation outside of nested loops. Since Visual Studio configures Arm builds with a relatively small amount of stack memory, attempting to allocate those buffers within the inner loops caused a stack overflow. Closes #461 Closes #475
150 lines
4.3 KiB
C
150 lines
4.3 KiB
C
/*
 * jchuff.h
 *
 * This file was part of the Independent JPEG Group's software:
 * Copyright (C) 1991-1997, Thomas G. Lane.
 * libjpeg-turbo Modifications:
 * Copyright (C) 2009, 2018, D. R. Commander.
 * Copyright (C) 2018, Matthias Räncker.
 * Copyright (C) 2020, Arm Limited.
 * For conditions of distribution and use, see the accompanying README.ijg
 * file.
 */

/* Expanded entropy encoder object for Huffman encoding.
 *
 * The savable_state subrecord contains fields that change within an MCU,
 * but must not be updated permanently until we complete the MCU.
 */

/* Width, in bits, of the bit accumulation buffer: 64 when building for
 * AArch64 (__aarch64__ is defined by GNU-compatible compilers, _M_ARM64 by
 * Visual Studio) and 32 otherwise.
 */
#if defined(__aarch64__) || defined(_M_ARM64)
#define BIT_BUF_SIZE  64
#else
#define BIT_BUF_SIZE  32
#endif

typedef struct {
|
|
size_t put_buffer; /* current bit accumulation buffer */
|
|
int free_bits; /* # of bits available in it */
|
|
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
|
|
} savable_state;
|
|
|
|
typedef struct {
|
|
JOCTET *next_output_byte; /* => next byte to write in buffer */
|
|
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
|
|
savable_state cur; /* Current bit buffer & DC state */
|
|
j_compress_ptr cinfo; /* dump_buffer needs access to this */
|
|
int simd;
|
|
} working_state;
|
|
|
|
/* Outputting bits to the file */

/* Output byte b and, speculatively, an additional 0 byte.  0xFF must be
 * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
 * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and
 * the speculative 0 byte will be overwritten by the next byte.
 *
 * The macro operates on a pointer variable named buffer in the invoking scope.
 * It always writes both buffer[0] and buffer[1], so two writable bytes must be
 * available at buffer.  The argument is converted to JOCTET and evaluated
 * exactly once.  The expansion is a brace-enclosed block, so no trailing
 * semicolon is required.
 */
#define EMIT_BYTE(b) { \
  const JOCTET emit_byte_val_ = (JOCTET)(b); \
  buffer[0] = emit_byte_val_; \
  buffer[1] = 0; \
  buffer += 2 - (emit_byte_val_ < 0xFF); \
}

/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
 * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
 * encode 0xFF as 0xFF 0x00.
 *
 * SPLAT() stores the bit buffer at buffer, most significant byte first,
 * without advancing buffer.  FLUSH() chooses between SPLAT() and EMIT_BYTE()
 * and advances buffer past everything it wrote.  Both macros operate on
 * variables named put_buffer and buffer in the invoking scope and expand to
 * brace-enclosed blocks, so no trailing semicolon is required.
 *
 * With Visual Studio (when not using Clang), SPLAT() stores one byte at a
 * time.  Otherwise it byte-swaps the value and copies it with
 * __builtin_memcpy(), because buffer is a byte pointer with no alignment
 * guarantee and a store through a casted wider pointer would be undefined
 * behavior.  NOTE: the byte-swapping variant relies on the target being
 * little-endian.
 */
#if defined(__aarch64__) || defined(_M_ARM64)

#if defined(_MSC_VER) && !defined(__clang__)
#define SPLAT() { \
  buffer[0] = (JOCTET)(put_buffer >> 56); \
  buffer[1] = (JOCTET)(put_buffer >> 48); \
  buffer[2] = (JOCTET)(put_buffer >> 40); \
  buffer[3] = (JOCTET)(put_buffer >> 32); \
  buffer[4] = (JOCTET)(put_buffer >> 24); \
  buffer[5] = (JOCTET)(put_buffer >> 16); \
  buffer[6] = (JOCTET)(put_buffer >>  8); \
  buffer[7] = (JOCTET)(put_buffer      ); \
}
#else
#define SPLAT() { \
  const uint64_t splat_word_ = __builtin_bswap64((uint64_t)put_buffer); \
  __builtin_memcpy(buffer, &splat_word_, sizeof(splat_word_)); \
}
#endif

/* The test below is nonzero exactly when some byte of put_buffer is 0xFF:
 * such a byte has its top bit set, and adding 1 to it clears that bit.
 */
#define FLUSH() { \
  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
    EMIT_BYTE(put_buffer >> 56) \
    EMIT_BYTE(put_buffer >> 48) \
    EMIT_BYTE(put_buffer >> 40) \
    EMIT_BYTE(put_buffer >> 32) \
    EMIT_BYTE(put_buffer >> 24) \
    EMIT_BYTE(put_buffer >> 16) \
    EMIT_BYTE(put_buffer >>  8) \
    EMIT_BYTE(put_buffer      ) \
  } else { \
    SPLAT() \
    buffer += 8; \
  } \
}

#else

#if defined(_MSC_VER) && !defined(__clang__)
#define SPLAT() { \
  buffer[0] = (JOCTET)(put_buffer >> 24); \
  buffer[1] = (JOCTET)(put_buffer >> 16); \
  buffer[2] = (JOCTET)(put_buffer >>  8); \
  buffer[3] = (JOCTET)(put_buffer      ); \
}
#else
#define SPLAT() { \
  const uint32_t splat_word_ = __builtin_bswap32((uint32_t)put_buffer); \
  __builtin_memcpy(buffer, &splat_word_, sizeof(splat_word_)); \
}
#endif

/* Same 0xFF detection as the 64-bit variant, over four bytes. */
#define FLUSH() { \
  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
    EMIT_BYTE(put_buffer >> 24) \
    EMIT_BYTE(put_buffer >> 16) \
    EMIT_BYTE(put_buffer >>  8) \
    EMIT_BYTE(put_buffer      ) \
  } else { \
    SPLAT() \
    buffer += 4; \
  } \
}

#endif

/* Fill the bit buffer to capacity with the leading bits from code, then output
 * the bit buffer and put the remaining bits from code into the bit buffer.
 *
 * This expects size to have been subtracted from free_bits already and
 * free_bits to be negative as a result, so that -free_bits is the number of
 * low-order bits of code that did not fit.  It operates on the variables
 * put_buffer and free_bits (and, through FLUSH(), buffer) in the invoking
 * scope.  Both arguments are evaluated more than once, so they must be free of
 * side effects.  The expansion is a brace-enclosed block, so no trailing
 * semicolon is required.
 */
#define PUT_AND_FLUSH(code, size) { \
  put_buffer = (put_buffer << ((size) + free_bits)) | ((code) >> -free_bits); \
  FLUSH() \
  free_bits += BIT_BUF_SIZE; \
  put_buffer = (code); \
}

/* Insert code into the bit buffer and output the bit buffer if needed.
 * NOTE: We can't flush with free_bits == 0, since the left shift in
 * PUT_AND_FLUSH() would have undefined behavior.
 *
 * size is first subtracted from free_bits.  If the result is negative, the
 * work is handed to PUT_AND_FLUSH(); otherwise code is appended below the bits
 * already in put_buffer.  The macro operates on the variables put_buffer and
 * free_bits in the invoking scope.  Both arguments are evaluated more than
 * once, so they must be free of side effects.  The expansion is a
 * brace-enclosed block, so no trailing semicolon is required.
 */
#define PUT_BITS(code, size) { \
  free_bits -= (size); \
  if (free_bits < 0) \
    PUT_AND_FLUSH(code, size) \
  else \
    put_buffer = (put_buffer << (size)) | (code); \
}

/* Place code above the nbits low-order bits of diff, add size to nbits, and
 * then insert the combined value into the bit buffer with a single PUT_BITS()
 * invocation.
 *
 * diff must be a modifiable lvalue.  The macro also reads and updates a
 * variable named nbits in the invoking scope, in addition to the variables
 * that PUT_BITS() operates on.  The expansion is a brace-enclosed block, so no
 * trailing semicolon is required.
 */
#define PUT_CODE(code, size, diff) { \
  (diff) |= (code) << nbits; \
  nbits += (size); \
  PUT_BITS(diff, nbits) \
}