Merge branch 'master' into dev
This commit is contained in:
84
ChangeLog.md
84
ChangeLog.md
@@ -1,7 +1,7 @@
|
||||
2.1 pre-beta
|
||||
============
|
||||
|
||||
### Significant changes relative to 2.0.3:
|
||||
### Significant changes relative to 2.0.4:
|
||||
|
||||
1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
|
||||
support the x32 ABI on Linux, which allows for using x86-64 instructions with
|
||||
@@ -43,19 +43,14 @@ longer supports 32-bit Java virtual machines. Oracle no longer provides a
|
||||
32-bit JVM for macOS, and Apple's implementation of Java 1.6 (Java for OS X
|
||||
systems) is long obsolete.
|
||||
|
||||
5. Fixed a regression in the Windows packaging system (introduced by
|
||||
2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
|
||||
64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
|
||||
one of them could be uninstalled.
|
||||
|
||||
6. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
|
||||
5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
|
||||
significantly optimized, resulting in a measured average overall compression
|
||||
speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
|
||||
and AMD CPUs, as well as a measured average overall compression speedup of
|
||||
0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
|
||||
implementation.
|
||||
|
||||
7. When decompressing progressive Huffman-encoded JPEG images, the block
|
||||
6. When decompressing progressive Huffman-encoded JPEG images, the block
|
||||
smoothing algorithm that the libjpeg API library optionally applies is now more
|
||||
fault-tolerant. Previously, if a particular scan was incomplete, then the
|
||||
smoothing parameters for the incomplete scan would be applied to the entire
|
||||
@@ -66,42 +61,16 @@ higher-frequency scan. libjpeg-turbo now applies block smoothing parameters to
|
||||
each iMCU row based on which scan generated the pixels in that row, rather than
|
||||
always using the block smoothing parameters for the most recent scan.
|
||||
|
||||
8. Fixed a signed integer overflow and subsequent segfault that occurred when
|
||||
attempting to decompress images with more than 715827882 pixels using the
|
||||
64-bit C version of TJBench.
|
||||
|
||||
9. Fixed an out-of-bounds write in `tjDecompressToYUV2()` and
|
||||
`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
|
||||
occurred when attempting to decompress grayscale JPEG images that were
|
||||
compressed with a sampling factor other than 1 (for instance, with
|
||||
`cjpeg -grayscale -sample 2x2`).
|
||||
|
||||
10. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
|
||||
incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
|
||||
JPEG images. This was known to cause a buffer overflow when attempting to
|
||||
decompress some such images using `tjDecompressToYUV2()` or
|
||||
`tjDecompressToYUVPlanes()`.
|
||||
|
||||
11. Fixed an issue, detected by ASan, whereby attempting to losslessly
|
||||
transform a specially-crafted malformed JPEG image containing an
|
||||
extremely-high-frequency coefficient block (junk image data that could never be
|
||||
generated by a legitimate JPEG compressor) could cause the Huffman encoder's
|
||||
local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].) Given that
|
||||
the buffer overrun was fully contained within the stack and did not cause a
|
||||
segfault or other user-visible errant behavior, and given that the lossless
|
||||
transformer (unlike the decompressor) is not generally exposed to arbitrary
|
||||
data exploits, this issue did not likely pose a security risk.
|
||||
|
||||
12. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
|
||||
7. Added SIMD acceleration for progressive Huffman encoding on ARM 64-bit
|
||||
(ARMv8) platforms. This speeds up the compression of full-color progressive
|
||||
JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
|
||||
modern ARMv8 CPUs.
|
||||
|
||||
13. Added configure-time and run-time auto-detection of Loongson MMI SIMD
|
||||
8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
|
||||
instructions, so that the Loongson MMI SIMD extensions can be included in any
|
||||
MIPS64 libjpeg-turbo build.
|
||||
|
||||
14. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
|
||||
9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
|
||||
methods by which applications can guard against the exploits of the JPEG format
|
||||
described in the report
|
||||
["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
|
||||
@@ -112,6 +81,47 @@ limit the number of allowable scans in the input file.
|
||||
treat all warnings as fatal.
|
||||
|
||||
|
||||
2.0.4
|
||||
=====
|
||||
|
||||
### Significant changes relative to 2.0.3:
|
||||
|
||||
1. Fixed a regression in the Windows packaging system (introduced by
|
||||
2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
|
||||
64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
|
||||
one of them could be uninstalled.
|
||||
|
||||
2. Fixed a signed integer overflow and subsequent segfault that occurred when
|
||||
attempting to decompress images with more than 715827882 pixels using the
|
||||
64-bit C version of TJBench.
|
||||
|
||||
3. Fixed an out-of-bounds write in `tjDecompressToYUV2()` and
|
||||
`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
|
||||
occurred when attempting to decompress grayscale JPEG images that were
|
||||
compressed with a sampling factor other than 1 (for instance, with
|
||||
`cjpeg -grayscale -sample 2x2`).
|
||||
|
||||
4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
|
||||
incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
|
||||
JPEG images. This was known to cause a buffer overflow when attempting to
|
||||
decompress some such images using `tjDecompressToYUV2()` or
|
||||
`tjDecompressToYUVPlanes()`.
|
||||
|
||||
5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
|
||||
a specially-crafted malformed JPEG image containing an extremely-high-frequency
|
||||
coefficient block (junk image data that could never be generated by a
|
||||
legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
|
||||
be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].) Given that the buffer
|
||||
overrun was fully contained within the stack and did not cause a segfault or
|
||||
other user-visible errant behavior, and given that the lossless transformer
|
||||
(unlike the decompressor) is not generally exposed to arbitrary data exploits,
|
||||
this issue did not likely pose a security risk.
|
||||
|
||||
6. The ARM 64-bit (ARMv8) NEON SIMD assembly code now stores constants in a
|
||||
separate read-only data section rather than in the text section, to support
|
||||
execute-only memory layouts.
|
||||
|
||||
|
||||
2.0.3
|
||||
=====
|
||||
|
||||
|
||||
34
README.md
34
README.md
@@ -1,14 +1,14 @@
|
||||
Background
|
||||
==========
|
||||
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
|
||||
AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
|
||||
on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
|
||||
compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is
|
||||
generally 2-6x as fast as libjpeg, all else being equal. On other types of
|
||||
systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
|
||||
virtue of its highly-optimized Huffman coding routines. In many cases, the
|
||||
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
|
||||
baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
|
||||
MIPS systems, as well as progressive JPEG compression on x86, x86-64, and ARMv8
|
||||
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
|
||||
all else being equal. On other types of systems, libjpeg-turbo can still
|
||||
outperform libjpeg by a significant amount, by virtue of its highly-optimized
|
||||
Huffman coding routines. In many cases, the performance of libjpeg-turbo
|
||||
rivals that of proprietary high-speed JPEG codecs.
|
||||
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less
|
||||
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
|
||||
@@ -145,14 +145,14 @@ supported and which aren't.
|
||||
|
||||
#### Fully supported
|
||||
|
||||
- **libjpeg: IDCT scaling extensions in decompressor**<br>
|
||||
- **libjpeg API: IDCT scaling extensions in decompressor**<br>
|
||||
libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
|
||||
1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
|
||||
and 1/2 are SIMD-accelerated).
|
||||
|
||||
- **libjpeg: Arithmetic coding**
|
||||
- **libjpeg API: Arithmetic coding**
|
||||
|
||||
- **libjpeg: In-memory source and destination managers**<br>
|
||||
- **libjpeg API: In-memory source and destination managers**<br>
|
||||
See notes below.
|
||||
|
||||
- **cjpeg: Separate quality settings for luminance and chrominance**<br>
|
||||
@@ -184,14 +184,14 @@ means of quality improvement. The reader is invited to peruse the research at
|
||||
but it is the general belief of our project that these features have not
|
||||
demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
|
||||
|
||||
- **libjpeg: DCT scaling in compressor**<br>
|
||||
- **libjpeg API: DCT scaling in compressor**<br>
|
||||
`cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
|
||||
There is no technical reason why DCT scaling could not be supported when
|
||||
emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
|
||||
below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
|
||||
8/9 would be available, which is of limited usefulness.
|
||||
|
||||
- **libjpeg: SmartScale**<br>
|
||||
- **libjpeg API: SmartScale**<br>
|
||||
`cinfo.block_size` is silently ignored.
|
||||
SmartScale is an extension to the JPEG format that allows for DCT block
|
||||
sizes other than 8x8. Providing support for this new format would be
|
||||
@@ -204,7 +204,7 @@ demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
|
||||
interest in providing this feature would be as a means of supporting
|
||||
additional DCT scaling factors.
|
||||
|
||||
- **libjpeg: Fancy downsampling in compressor**<br>
|
||||
- **libjpeg API: Fancy downsampling in compressor**<br>
|
||||
`cinfo.do_fancy_downsampling` is silently ignored.
|
||||
This requires the DCT scaling feature, which is not supported.
|
||||
|
||||
@@ -252,8 +252,8 @@ building libjpeg-turbo. This will restore the pre-1.3 behavior, in which
|
||||
libjpeg v8 API/ABI.
|
||||
|
||||
On Un*x systems, including the in-memory source/destination managers changes
|
||||
the dynamic library version from 62.1.0 to 62.2.0 if using libjpeg v6b API/ABI
|
||||
emulation and from 7.1.0 to 7.2.0 if using libjpeg v7 API/ABI emulation.
|
||||
the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
|
||||
emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
|
||||
|
||||
Note that, on most Un*x systems, the dynamic linker will not look for a
|
||||
function in a library until that function is actually used. Thus, if a program
|
||||
@@ -329,7 +329,7 @@ in a way that makes the rest of the libjpeg infrastructure happy, so it is
|
||||
necessary to use the slow Huffman decoder when decompressing a JPEG image that
|
||||
has restart markers. This can cause the decompression performance to drop by
|
||||
as much as 20%, but the performance will still be much greater than that of
|
||||
libjpeg. Many consumer packages, such as PhotoShop, use restart markers when
|
||||
libjpeg. Many consumer packages, such as Photoshop, use restart markers when
|
||||
generating JPEG images, so images generated by those programs will experience
|
||||
this issue.
|
||||
|
||||
|
||||
2
djpeg.c
2
djpeg.c
@@ -538,7 +538,9 @@ main(int argc, char **argv)
|
||||
FILE *input_file;
|
||||
FILE *output_file;
|
||||
unsigned char *inbuffer = NULL;
|
||||
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
|
||||
unsigned long insize = 0;
|
||||
#endif
|
||||
JDIMENSION num_scanlines;
|
||||
|
||||
/* On Mac, fetch a command line. */
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and ARMv8 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less powerful but more straightforward TurboJPEG API. libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
|
||||
|
||||
|
||||
@@ -8,15 +8,14 @@ Maintainer: @PKGVENDOR@ <@PKGEMAIL@>
|
||||
Homepage: @PKGURL@
|
||||
Installed-Size: {__SIZE}
|
||||
Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
|
||||
AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
|
||||
on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
|
||||
compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is
|
||||
generally 2-6x as fast as libjpeg, all else being equal. On other types of
|
||||
systems, libjpeg-turbo can still outperform libjpeg by a significant amount,
|
||||
by virtue of its highly-optimized Huffman coding routines. In many cases, the
|
||||
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG
|
||||
codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
|
||||
baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
|
||||
MIPS systems, as well as progressive JPEG compression on x86, x86-64, and
|
||||
ARMv8 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as
|
||||
libjpeg, all else being equal. On other types of systems, libjpeg-turbo can
|
||||
still outperform libjpeg by a significant amount, by virtue of its
|
||||
highly-optimized Huffman coding routines. In many cases, the performance of
|
||||
libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
.
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less
|
||||
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
|
||||
|
||||
@@ -51,14 +51,14 @@ Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{r
|
||||
%endif
|
||||
|
||||
%description
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
|
||||
AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
|
||||
on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
|
||||
compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is
|
||||
generally 2-6x as fast as libjpeg, all else being equal. On other types of
|
||||
systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
|
||||
virtue of its highly-optimized Huffman coding routines. In many cases, the
|
||||
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
|
||||
baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
|
||||
MIPS systems, as well as progressive JPEG compression on x86, x86-64, and ARMv8
|
||||
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
|
||||
all else being equal. On other types of systems, libjpeg-turbo can still
|
||||
outperform libjpeg by a significant amount, by virtue of its highly-optimized
|
||||
Huffman coding routines. In many cases, the performance of libjpeg-turbo
|
||||
rivals that of proprietary high-speed JPEG codecs.
|
||||
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less
|
||||
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
|
||||
|
||||
@@ -31,6 +31,265 @@
|
||||
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
.section __DATA,__const
|
||||
#else
|
||||
.section .rodata, "a", %progbits
|
||||
#endif
|
||||
|
||||
/* Constants for jsimd_idct_islow_neon() */
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
/* Constants for jsimd_idct_ifast_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_ifast_neon_consts:
|
||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
|
||||
|
||||
/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
|
||||
|
||||
#define CONST_BITS 13
|
||||
|
||||
#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
|
||||
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
|
||||
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
|
||||
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
|
||||
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
|
||||
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
|
||||
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
|
||||
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
|
||||
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
|
||||
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
|
||||
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
|
||||
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
|
||||
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
|
||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.h[0] */
|
||||
.short -FIX_0_765366865 /* v0.h[1] */
|
||||
.short -FIX_0_211164243 /* v0.h[2] */
|
||||
.short FIX_1_451774981 /* v0.h[3] */
|
||||
.short -FIX_2_172734803 /* d1[0] */
|
||||
.short FIX_1_061594337 /* d1[1] */
|
||||
.short -FIX_0_509795579 /* d1[2] */
|
||||
.short -FIX_0_601344887 /* d1[3] */
|
||||
.short FIX_0_899976223 /* v2.h[0] */
|
||||
.short FIX_2_562915447 /* v2.h[1] */
|
||||
.short 1 << (CONST_BITS + 1) /* v2.h[2] */
|
||||
.short 0 /* v2.h[3] */
|
||||
|
||||
.balign 8
|
||||
Ljsimd_idct_2x2_neon_consts:
|
||||
.short -FIX_0_720959822 /* v14[0] */
|
||||
.short FIX_0_850430095 /* v14[1] */
|
||||
.short -FIX_1_272758580 /* v14[2] */
|
||||
.short FIX_3_624509785 /* v14[3] */
|
||||
|
||||
/* Constants for jsimd_ycc_*_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_ycc_rgb_neon_consts:
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
.short -128, -128, -128, -128
|
||||
|
||||
/* Constants for jsimd_*_ycc_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_rgb_ycc_neon_consts:
|
||||
.short 19595, 38470, 7471, 11059
|
||||
.short 21709, 32768, 27439, 5329
|
||||
.short 32767, 128, 32767, 128
|
||||
.short 32767, 128, 32767, 128
|
||||
|
||||
/* Constants for jsimd_fdct_islow_neon() */
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
/* Constants for jsimd_fdct_ifast_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_ifast_neon_consts:
|
||||
.short (98 * 128) /* XFIX_0_382683433 */
|
||||
.short (139 * 128) /* XFIX_0_541196100 */
|
||||
.short (181 * 128) /* XFIX_0_707106781 */
|
||||
.short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
|
||||
|
||||
/* Constants for jsimd_h2*_downsample_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_h2_downsample_neon_consts:
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
|
||||
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
|
||||
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
|
||||
.byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
|
||||
.byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
|
||||
|
||||
/* Constants for jsimd_huff_encode_one_block_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_huff_encode_one_block_neon_consts:
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
.byte 0, 1, 2, 3, 16, 17, 32, 33, \
|
||||
18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
|
||||
.byte 34, 35, 48, 49, 255, 255, 50, 51, \
|
||||
36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
|
||||
.byte 8, 9, 22, 23, 36, 37, 50, 51, \
|
||||
255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
|
||||
.byte 54, 55, 40, 41, 26, 27, 12, 13, \
|
||||
14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
|
||||
.byte 6, 7, 20, 21, 34, 35, 48, 49, \
|
||||
50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
|
||||
.byte 42, 43, 28, 29, 14, 15, 30, 31, \
|
||||
44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
|
||||
.byte 255, 255, 255, 255, 56, 57, 42, 43, \
|
||||
28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
|
||||
.byte 26, 27, 40, 41, 42, 43, 28, 29, \
|
||||
14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
|
||||
.byte 255, 255, 255, 255, 0, 1, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
|
||||
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
|
||||
|
||||
/* Constants for jsimd_encode_mcu_AC_first_prepare_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
|
||||
/* Constants for jsimd_encode_mcu_AC_refine_prepare_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
|
||||
.text
|
||||
|
||||
|
||||
@@ -55,6 +314,17 @@ _\fname:
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Get symbol location: load the address of `symbol` into `reg` (page base via ADRP, then add the low 12-bit page offset) */
|
||||
.macro get_symbol_loc reg, symbol
|
||||
#ifdef __APPLE__
|
||||
adrp \reg, \symbol@PAGE
|
||||
add \reg, \reg, \symbol@PAGEOFF
|
||||
#else
|
||||
adrp \reg, \symbol
|
||||
add \reg, \reg, :lo12:\symbol
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Transpose elements of single 128 bit registers */
|
||||
.macro transpose_single x0, x1, xi, xilen, literal
|
||||
ins \xi\xilen[0], \x0\xilen[0]
|
||||
@@ -139,51 +409,6 @@ _\fname:
|
||||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
#define XFIX_P_0_298 v0.h[0]
|
||||
#define XFIX_N_0_390 v0.h[1]
|
||||
#define XFIX_P_0_541 v0.h[2]
|
||||
@@ -217,7 +442,7 @@ asm_function jsimd_idct_islow_neon
|
||||
uxtw x3, w3
|
||||
|
||||
sub sp, sp, #64
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
|
||||
mov x10, sp
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
|
||||
@@ -791,13 +1016,6 @@ asm_function jsimd_idct_islow_neon
|
||||
#define XFIX_1_847759065 v0.h[2]
|
||||
#define XFIX_2_613125930 v0.h[3]
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_ifast_neon_consts:
|
||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
|
||||
|
||||
asm_function jsimd_idct_ifast_neon
|
||||
|
||||
DCT_TABLE .req x0
|
||||
@@ -832,7 +1050,7 @@ asm_function jsimd_idct_ifast_neon
|
||||
* 7 | d30 | d31 ( v23.8h )
|
||||
*/
|
||||
/* Save NEON registers used in fast IDCT */
|
||||
adr TMP5, Ljsimd_idct_ifast_neon_consts
|
||||
get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts
|
||||
ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
|
||||
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
||||
ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
|
||||
@@ -1023,38 +1241,6 @@ asm_function jsimd_idct_ifast_neon
|
||||
* but readability will suffer somewhat.
|
||||
*/
|
||||
|
||||
#define CONST_BITS 13
|
||||
|
||||
#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
|
||||
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
|
||||
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
|
||||
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
|
||||
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
|
||||
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
|
||||
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
|
||||
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
|
||||
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
|
||||
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
|
||||
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
|
||||
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
|
||||
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
|
||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.h[0] */
|
||||
.short -FIX_0_765366865 /* v0.h[1] */
|
||||
.short -FIX_0_211164243 /* v0.h[2] */
|
||||
.short FIX_1_451774981 /* v0.h[3] */
|
||||
.short -FIX_2_172734803 /* d1[0] */
|
||||
.short FIX_1_061594337 /* d1[1] */
|
||||
.short -FIX_0_509795579 /* d1[2] */
|
||||
.short -FIX_0_601344887 /* d1[3] */
|
||||
.short FIX_0_899976223 /* v2.h[0] */
|
||||
.short FIX_2_562915447 /* v2.h[1] */
|
||||
.short 1 << (CONST_BITS + 1) /* v2.h[2] */
|
||||
.short 0 /* v2.h[3] */
|
||||
|
||||
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
|
||||
smull v28.4s, \x4, v2.h[2]
|
||||
smlal v28.4s, \x8, v0.h[0]
|
||||
@@ -1121,7 +1307,7 @@ asm_function jsimd_idct_4x4_neon
|
||||
sub sp, sp, 64
|
||||
mov x9, sp
|
||||
/* Load constants (v3.4h is just used for padding) */
|
||||
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||
@@ -1264,13 +1450,6 @@ asm_function jsimd_idct_4x4_neon
|
||||
* bit exact compatibility with jpeg-6b.
|
||||
*/
|
||||
|
||||
.balign 8
|
||||
Ljsimd_idct_2x2_neon_consts:
|
||||
.short -FIX_0_720959822 /* v14[0] */
|
||||
.short FIX_0_850430095 /* v14[1] */
|
||||
.short -FIX_1_272758580 /* v14[2] */
|
||||
.short FIX_3_624509785 /* v14[3] */
|
||||
|
||||
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
||||
sshll v15.4s, \x4, #15
|
||||
smull v26.4s, \x6, v14.h[3]
|
||||
@@ -1311,7 +1490,7 @@ asm_function jsimd_idct_2x2_neon
|
||||
mov x9, sp
|
||||
|
||||
/* Load constants */
|
||||
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v14.4h}, [TMP2]
|
||||
@@ -1663,21 +1842,6 @@ asm_function jsimd_idct_2x2_neon
|
||||
do_yuv_to_rgb_stage2
|
||||
.endm
|
||||
|
||||
/* Apple gas crashes on adrl, work around that by using adr.
|
||||
* But this requires a copy of these constants for each function.
|
||||
*/
|
||||
|
||||
.balign 16
|
||||
.if \fast_st3 == 1
|
||||
Ljsimd_ycc_\colorid\()_neon_consts:
|
||||
.else
|
||||
Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
|
||||
.endif
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
.short -128, -128, -128, -128
|
||||
|
||||
.if \fast_st3 == 1
|
||||
asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
.else
|
||||
@@ -1703,11 +1867,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
mov x9, sp
|
||||
|
||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||
.if \fast_st3 == 1
|
||||
adr x15, Ljsimd_ycc_\colorid\()_neon_consts
|
||||
.else
|
||||
adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
|
||||
.endif
|
||||
get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts
|
||||
|
||||
/* Save NEON registers */
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
@@ -2004,17 +2164,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b,
|
||||
do_rgb_to_yuv_stage1
|
||||
.endm
|
||||
|
||||
.balign 16
|
||||
.if \fast_ld3 == 1
|
||||
Ljsimd_\colorid\()_ycc_neon_consts:
|
||||
.else
|
||||
Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
|
||||
.endif
|
||||
.short 19595, 38470, 7471, 11059
|
||||
.short 21709, 32768, 27439, 5329
|
||||
.short 32767, 128, 32767, 128
|
||||
.short 32767, 128, 32767, 128
|
||||
|
||||
.if \fast_ld3 == 1
|
||||
asm_function jsimd_\colorid\()_ycc_convert_neon
|
||||
.else
|
||||
@@ -2037,11 +2186,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
N .req w12
|
||||
|
||||
/* Load constants to d0, d1, d2, d3 */
|
||||
.if \fast_ld3 == 1
|
||||
adr x13, Ljsimd_\colorid\()_ycc_neon_consts
|
||||
.else
|
||||
adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
|
||||
.endif
|
||||
get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
|
||||
ld1 {v0.8h, v1.8h}, [x13]
|
||||
|
||||
ldr OUTPUT_BUF0, [OUTPUT_BUF]
|
||||
@@ -2241,50 +2386,6 @@ asm_function jsimd_convsamp_neon
|
||||
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
#define XFIX_P_0_298 v0.h[0]
|
||||
#define XFIX_N_0_390 v0.h[1]
|
||||
#define XFIX_P_0_541 v0.h[2]
|
||||
@@ -2304,7 +2405,7 @@ asm_function jsimd_fdct_islow_neon
|
||||
TMP .req x9
|
||||
|
||||
/* Load constants */
|
||||
adr TMP, Ljsimd_fdct_islow_neon_consts
|
||||
get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
|
||||
ld1 {v0.8h, v1.8h}, [TMP]
|
||||
|
||||
/* Save NEON registers */
|
||||
@@ -2583,20 +2684,13 @@ asm_function jsimd_fdct_islow_neon
|
||||
#define XFIX_0_707106781 v0.h[2]
|
||||
#define XFIX_1_306562965 v0.h[3]
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_ifast_neon_consts:
|
||||
.short (98 * 128) /* XFIX_0_382683433 */
|
||||
.short (139 * 128) /* XFIX_0_541196100 */
|
||||
.short (181 * 128) /* XFIX_0_707106781 */
|
||||
.short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
|
||||
|
||||
asm_function jsimd_fdct_ifast_neon
|
||||
|
||||
DATA .req x0
|
||||
TMP .req x9
|
||||
|
||||
/* Load constants */
|
||||
adr TMP, Ljsimd_fdct_ifast_neon_consts
|
||||
get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts
|
||||
ld1 {v0.4h}, [TMP]
|
||||
|
||||
/* Load all DATA into NEON registers with the following allocation:
|
||||
@@ -2775,41 +2869,6 @@ asm_function jsimd_quantize_neon
|
||||
* JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
*/
|
||||
|
||||
.balign 16
|
||||
Ljsimd_h2_downsample_neon_consts:
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
|
||||
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
|
||||
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
|
||||
.byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
|
||||
.byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
|
||||
|
||||
asm_function jsimd_h2v1_downsample_neon
|
||||
IMAGE_WIDTH .req x0
|
||||
MAX_V_SAMP .req x1
|
||||
@@ -2827,7 +2886,7 @@ asm_function jsimd_h2v1_downsample_neon
|
||||
mov TMPDUP, #0x10000
|
||||
lsl TMP2, BLOCK_WIDTH, #4
|
||||
sub TMP2, TMP2, IMAGE_WIDTH
|
||||
adr TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
add TMP3, TMP3, TMP2, lsl #4
|
||||
dup v16.4s, TMPDUP
|
||||
ld1 {v18.16b}, [TMP3]
|
||||
@@ -2906,7 +2965,7 @@ asm_function jsimd_h2v2_downsample_neon
|
||||
lsl TMP2, BLOCK_WIDTH, #4
|
||||
lsl TMPDUP, TMPDUP, #17
|
||||
sub TMP2, TMP2, IMAGE_WIDTH
|
||||
adr TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
orr TMPDUP, TMPDUP, #1
|
||||
add TMP3, TMP3, TMP2, lsl #4
|
||||
dup v16.4s, TMPDUP
|
||||
@@ -3012,41 +3071,6 @@ asm_function jsimd_h2v2_downsample_neon
|
||||
|
||||
.macro generate_jsimd_huff_encode_one_block fast_tbl
|
||||
|
||||
.balign 16
|
||||
.if \fast_tbl == 1
|
||||
Ljsimd_huff_encode_one_block_neon_consts:
|
||||
.else
|
||||
Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
|
||||
.endif
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
.if \fast_tbl == 1
|
||||
.byte 0, 1, 2, 3, 16, 17, 32, 33, \
|
||||
18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
|
||||
.byte 34, 35, 48, 49, 255, 255, 50, 51, \
|
||||
36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
|
||||
.byte 8, 9, 22, 23, 36, 37, 50, 51, \
|
||||
255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
|
||||
.byte 54, 55, 40, 41, 26, 27, 12, 13, \
|
||||
14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
|
||||
.byte 6, 7, 20, 21, 34, 35, 48, 49, \
|
||||
50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
|
||||
.byte 42, 43, 28, 29, 14, 15, 30, 31, \
|
||||
44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
|
||||
.byte 255, 255, 255, 255, 56, 57, 42, 43, \
|
||||
28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
|
||||
.byte 26, 27, 40, 41, 42, 43, 28, 29, \
|
||||
14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
|
||||
.byte 255, 255, 255, 255, 0, 1, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
|
||||
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
|
||||
.endif
|
||||
|
||||
.if \fast_tbl == 1
|
||||
asm_function jsimd_huff_encode_one_block_neon
|
||||
.else
|
||||
@@ -3056,11 +3080,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
|
||||
/* Save ARM registers */
|
||||
stp x19, x20, [sp]
|
||||
.if \fast_tbl == 1
|
||||
adr x15, Ljsimd_huff_encode_one_block_neon_consts
|
||||
.else
|
||||
adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
|
||||
.endif
|
||||
get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
|
||||
ldr PUT_BUFFER, [x0, #0x10]
|
||||
ldr PUT_BITSw, [x0, #0x18]
|
||||
ldrsh w12, [x2] /* load DC coeff in w12 */
|
||||
@@ -3724,13 +3744,8 @@ generate_jsimd_huff_encode_one_block 0
|
||||
LENEND .req w9
|
||||
BITS .req x5
|
||||
|
||||
.balign 16
|
||||
Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
|
||||
asm_function jsimd_encode_mcu_AC_first_prepare_neon
|
||||
adr T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
|
||||
get_symbol_loc T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
|
||||
neg w3, w3 /* Al = -Al */
|
||||
eor ZERO.16b, ZERO.16b, ZERO.16b
|
||||
ld1 {ANDMASK.16b}, [T0]
|
||||
@@ -3869,13 +3884,8 @@ asm_function jsimd_encode_mcu_AC_first_prepare_neon
|
||||
LENEND .req w9
|
||||
BITS .req x5
|
||||
|
||||
.balign 16
|
||||
Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
|
||||
asm_function jsimd_encode_mcu_AC_refine_prepare_neon
|
||||
adr T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
|
||||
get_symbol_loc T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
|
||||
neg w3, w3 /* Al = -Al */
|
||||
movi ONE.8h, #1
|
||||
eor SIGN, SIGN, SIGN
|
||||
|
||||
@@ -1906,10 +1906,11 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
|
||||
if (xinfo[i].crop) {
|
||||
if ((t[i].r.x % xinfo[i].iMCU_sample_width) != 0 ||
|
||||
(t[i].r.y % xinfo[i].iMCU_sample_height) != 0) {
|
||||
snprintf(errStr, JMSG_LENGTH_MAX,
|
||||
snprintf(this->errStr, JMSG_LENGTH_MAX,
|
||||
"To crop this JPEG image, x must be a multiple of %d\n"
|
||||
"and y must be a multiple of %d.\n",
|
||||
xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
|
||||
this->isInstanceError = TRUE;
|
||||
retval = -1; goto bailout;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user