Compare commits
76 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5198654f73 | ||
|
|
0d2908a62b | ||
|
|
ee39375c85 | ||
|
|
50d4088439 | ||
|
|
7e8ed0d448 | ||
|
|
6c1538470f | ||
|
|
6b05623682 | ||
|
|
c716918d0a | ||
|
|
619c20d5e8 | ||
|
|
f8a5b80cb0 | ||
|
|
cc11b90b22 | ||
|
|
0629d2a00d | ||
|
|
32ba839c57 | ||
|
|
1da5cf4251 | ||
|
|
fbf0a5fbc5 | ||
|
|
da75d56d6c | ||
|
|
02939f53a0 | ||
|
|
39b950076a | ||
|
|
c0f5e0b702 | ||
|
|
4966e1eec5 | ||
|
|
67753d1298 | ||
|
|
f446e5d5c7 | ||
|
|
0c8bf27a3c | ||
|
|
0aea1da9f0 | ||
|
|
85e2e0f9c2 | ||
|
|
f2ec34de52 | ||
|
|
5e8fd24131 | ||
|
|
71441f322e | ||
|
|
aa20343efd | ||
|
|
7faa703ebf | ||
|
|
e4189accd3 | ||
|
|
5ba6c7effb | ||
|
|
bf506e11b7 | ||
|
|
cdb6c34e1c | ||
|
|
576eef0509 | ||
|
|
f654cf0e2c | ||
|
|
70d831dc0d | ||
|
|
510e67c542 | ||
|
|
243aba148e | ||
|
|
2a4e7f1bc3 | ||
|
|
6a244cb514 | ||
|
|
2ee9faef10 | ||
|
|
a07787f423 | ||
|
|
ace9d06b9e | ||
|
|
752ee33e86 | ||
|
|
d44ffd9db0 | ||
|
|
598cd994f3 | ||
|
|
fa628eff6a | ||
|
|
933289f509 | ||
|
|
888d4075ee | ||
|
|
ddd54ff8a8 | ||
|
|
0d435698f4 | ||
|
|
63c1674ebc | ||
|
|
864600d707 | ||
|
|
4efb529bb7 | ||
|
|
f8e8039204 | ||
|
|
aa805bc89f | ||
|
|
81a64020e3 | ||
|
|
a546be5141 | ||
|
|
c7dadd2d0b | ||
|
|
7475e59637 | ||
|
|
25e40dc42c | ||
|
|
296c8bad7e | ||
|
|
6b99f99b88 | ||
|
|
779f5622aa | ||
|
|
0cbef40560 | ||
|
|
aed7d4661e | ||
|
|
de852420c0 | ||
|
|
c1afc7921d | ||
|
|
a9cad80d19 | ||
|
|
f03d5df238 | ||
|
|
0e9c14e1bb | ||
|
|
602f5bea74 | ||
|
|
24ad6a0179 | ||
|
|
60ba1963fe | ||
|
|
f7067a9e73 |
32
.gitignore
vendored
32
.gitignore
vendored
@@ -1,14 +1,44 @@
|
||||
Makefile.in
|
||||
Makefile
|
||||
/CMakeFiles
|
||||
/autom4te.cache
|
||||
/aclocal.m4
|
||||
/compile
|
||||
/configure
|
||||
/depcomp
|
||||
/install-sh
|
||||
/libtool
|
||||
/missing
|
||||
/stamp-h1
|
||||
/stamp-h*
|
||||
/java/classnoinst.stamp
|
||||
/pkgscripts/
|
||||
/jconfig.h
|
||||
/jconfigint.h
|
||||
/config.guess
|
||||
/config.h
|
||||
/config.h.in
|
||||
/config.log
|
||||
/config.status
|
||||
/config.sub
|
||||
/ltmain.sh
|
||||
/ar-lib
|
||||
/libjpeg.map
|
||||
/.libs/
|
||||
/simd/.libs/
|
||||
/simd/jsimdcfg.inc
|
||||
*.o
|
||||
*.lo
|
||||
*.la
|
||||
/cjpeg
|
||||
/djpeg
|
||||
/jcstest
|
||||
/jpegtran
|
||||
/jpegyuv
|
||||
/md5/md5cmp
|
||||
/rdjpgcom
|
||||
/test_enc_*
|
||||
/tjbench
|
||||
/tjbenchtest
|
||||
/tjunittest
|
||||
/wrjpgcom
|
||||
/yuvjpeg
|
||||
|
||||
118
BUILDING.txt
118
BUILDING.txt
@@ -7,6 +7,7 @@
|
||||
Build Requirements
|
||||
==================
|
||||
|
||||
-- pkg-config
|
||||
-- autoconf 2.56 or later
|
||||
-- automake 1.7 or later
|
||||
-- libtool 1.4 or later
|
||||
@@ -68,12 +69,25 @@ The following procedure will build mozjpeg on Linux, FreeBSD, Cygwin, and
|
||||
Solaris/x86 systems (on Solaris, this generates a 32-bit library. See below
|
||||
for 64-bit build instructions.)
|
||||
|
||||
Simple Release tar.gz Source Build
|
||||
----------------------------------
|
||||
|
||||
cd {source_directory}
|
||||
./configure [additional configure flags]
|
||||
make
|
||||
|
||||
Non-Release Source Build (e.g. GitHub clone)
|
||||
--------------------------------------------
|
||||
|
||||
cd {source_directory}
|
||||
autoreconf -fiv
|
||||
cd {build_directory}
|
||||
sh {source_directory}/configure [additional configure flags]
|
||||
make
|
||||
|
||||
Results
|
||||
-------
|
||||
|
||||
This will generate the following files under .libs/
|
||||
|
||||
libjpeg.a
|
||||
@@ -309,6 +323,9 @@ Additional build requirements:
|
||||
(https://sourceforge.net/p/libjpeg-turbo/code/HEAD/tree/gas-preprocessor)
|
||||
should be installed in your PATH.
|
||||
|
||||
|
||||
ARM 32-bit Build (Xcode 4.6.x and earlier, LLVM-GCC):
|
||||
|
||||
Set the following shell variables for simplicity:
|
||||
|
||||
Xcode 4.2 and earlier:
|
||||
@@ -317,47 +334,80 @@ Set the following shell variables for simplicity:
|
||||
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
||||
|
||||
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||
|
||||
Xcode 4.6.x and earlier:
|
||||
IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
|
||||
Xcode 5.0.x and later:
|
||||
IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
|
||||
ARM v6 (code will run on all iOS devices, not SIMD-accelerated):
|
||||
ARMv6 (code will run on all iOS devices, not SIMD-accelerated):
|
||||
[NOTE: Requires Xcode 4.4.x or earlier]
|
||||
IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
|
||||
|
||||
ARM v7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
|
||||
Xcode 4.6.x and earlier:
|
||||
ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
|
||||
IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
|
||||
Xcode 5.0.x and later:
|
||||
IOS_CFLAGS="-arch armv7"
|
||||
|
||||
ARM v7s (code will run on iPhone 5/iPad 4th Generation and newer):
|
||||
ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
|
||||
[NOTE: Requires Xcode 4.5 or later]
|
||||
Xcode 4.6.x and earlier:
|
||||
IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
|
||||
Xcode 5.0.x and later:
|
||||
IOS_CFLAGS="-arch armv7s"
|
||||
|
||||
Follow the procedure under "Building mozjpeg" above, adding
|
||||
|
||||
--host arm-apple-darwin10 --enable-static --disable-shared \
|
||||
--host arm-apple-darwin10 \
|
||||
CC="$IOS_GCC" LD="$IOS_GCC" \
|
||||
CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
||||
LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
|
||||
|
||||
to the configure command line. If using Xcode 5.0.x or later, also add
|
||||
to the configure command line.
|
||||
|
||||
|
||||
ARM 32-bit Build (Xcode 5.0.x and later, Clang):
|
||||
|
||||
Set the following shell variables for simplicity:
|
||||
|
||||
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
||||
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||
IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
|
||||
ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
|
||||
IOS_CFLAGS="-arch armv7"
|
||||
|
||||
ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
|
||||
IOS_CFLAGS="-arch armv7s"
|
||||
|
||||
Follow the procedure under "Building mozjpeg" above, adding
|
||||
|
||||
--host arm-apple-darwin10 \
|
||||
CC="$IOS_GCC" LD="$IOS_GCC" \
|
||||
CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
||||
LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS" \
|
||||
CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
|
||||
|
||||
to the configure command line.
|
||||
|
||||
|
||||
ARMv8 64-bit Build (Xcode 5.0.x and later, Clang):
|
||||
|
||||
Code will run on iPhone 5S/iPad Mini 2 and newer.
|
||||
|
||||
Set the following shell variables for simplicity:
|
||||
|
||||
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
||||
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||
IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
IOS_CFLAGS="-arch arm64"
|
||||
|
||||
Follow the procedure under "Building mozjpeg" above, adding
|
||||
|
||||
--host aarch64-apple-darwin \
|
||||
CC="$IOS_GCC" LD="$IOS_GCC" \
|
||||
CFLAGS="-isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
||||
LDFLAGS="-isysroot $IOS_SYSROOT $IOS_CFLAGS"
|
||||
|
||||
to the configure command line.
|
||||
|
||||
|
||||
NOTE: You can also add -miphoneos-version-min={version} to $IOS_CFLAGS above
|
||||
in order to support older versions of iOS than the default version supported by
|
||||
the SDK.
|
||||
|
||||
Once built, lipo can be used to combine the ARM v6, v7, and/or v7s variants
|
||||
Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
|
||||
into a universal library.
|
||||
|
||||
|
||||
@@ -732,26 +782,32 @@ make udmg [BUILDDIR32={32-bit build directory}]
|
||||
make command line as shown above.
|
||||
|
||||
make iosdmg [BUILDDIR32={32-bit build directory}] \
|
||||
[BUILDDIRARMV6={ARM v6 build directory}] \
|
||||
[BUILDDIRARMV7={ARM v7 build directory}] \
|
||||
[BUILDDIRARMV7S={ARM v7s build directory}]
|
||||
[BUILDDIRARMV6={ARMv6 build directory}] \
|
||||
[BUILDDIRARMV7={ARMv7 build directory}] \
|
||||
[BUILDDIRARMV7S={ARMv7s build directory}] \
|
||||
[BUILDDIRARMV8={ARMv8 build directory}]
|
||||
|
||||
On OS X systems, this creates a Macintosh package and disk image in which the
|
||||
mozjpeg static libraries contain ARM architectures necessary to build
|
||||
iOS applications. If building on an x86-64 system, the binaries will also
|
||||
contain the i386 architecture, as with 'make udmg' above. You should first
|
||||
configure ARM v6, ARM v7, and/or ARM v7s out-of-tree builds of mozjpeg
|
||||
(see "Building mozjpeg for iOS" above.) If you are building an x86-64
|
||||
version of mozjpeg, you should configure a 32-bit out-of-tree build as
|
||||
well. Next, build mozjpeg as you would normally, using an out-of-tree
|
||||
build. When it is built, run 'make iosdmg' from the build directory. The
|
||||
build system will look for the ARM v6 build under {source_directory}/iosarmv6
|
||||
by default, the ARM v7 build under {source_directory}/iosarmv7 by default,
|
||||
the ARM v7s build under {source_directory}/iosarmv7s by default, and (if
|
||||
applicable) the 32-bit build under {source_directory}/osxx86 by default, but
|
||||
you can override this by setting the BUILDDIR32, BUILDDIRARMV6,
|
||||
BUILDDIRARMV7, and/or BUILDDIRARMV7S variables on the make command line as
|
||||
shown above.
|
||||
configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
|
||||
mozjpeg (see "Building mozjpeg for iOS" above.) If you are
|
||||
building an x86-64 version of mozjpeg, you should configure a 32-bit
|
||||
out-of-tree build as well. Next, build mozjpeg as you would normally,
|
||||
using an out-of-tree build. When it is built, run 'make iosdmg' from the
|
||||
build directory. The build system will look for the ARMv6 build under
|
||||
{source_directory}/iosarmv6 by default, the ARMv7 build under
|
||||
{source_directory}/iosarmv7 by default, the ARMv7s build under
|
||||
{source_directory}/iosarmv7s by default, the ARMv8 build under
|
||||
{source_directory}/iosarmv8 by default, and (if applicable) the 32-bit build
|
||||
under {source_directory}/osxx86 by default, but you can override this by
|
||||
setting the BUILDDIR32, BUILDDIRARMV6, BUILDDIRARMV7, BUILDDIRARMV7S, and/or
|
||||
BUILDDIRARMV8 variables on the make command line as shown above.
|
||||
|
||||
NOTE: If including an ARMv8 build in the package, then you may need to use
|
||||
Xcode's version of lipo instead of the operating system's. To do this, pass
|
||||
an argument of LIPO="xcrun lipo" on the make command line.
|
||||
|
||||
make cygwinpkg
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ if(POLICY CMP0022)
|
||||
endif()
|
||||
|
||||
project(mozjpeg C)
|
||||
set(VERSION 3.0)
|
||||
set(VERSION 3.1)
|
||||
|
||||
if(CYGWIN OR NOT CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
|
||||
execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
|
||||
@@ -557,7 +557,7 @@ foreach(libtype shared static)
|
||||
add_test(djpeg${suffix}-gray-islow-rgb
|
||||
${dir}djpeg${suffix} -dct int -rgb -outfile testout_gray_islow_rgb.ppm
|
||||
testout_gray_islow.jpg)
|
||||
add_test(cjpeg${suffix}-gray-islow-rgb-cmp
|
||||
add_test(djpeg${suffix}-gray-islow-rgb-cmp
|
||||
${CMAKE_COMMAND} -DMD5=${MD5_PPM_GRAY_ISLOW_RGB}
|
||||
-DFILE=testout_gray_islow_rgb.ppm
|
||||
-P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
|
||||
@@ -637,10 +637,10 @@ foreach(libtype shared static)
|
||||
endif()
|
||||
if(WITH_ARITH_DEC)
|
||||
# CC: RGB->YCC SAMP: h2v2 merged IDCT: ifast ENT: arith
|
||||
add_test(cjpeg${suffix}-420m-ifast-ari
|
||||
add_test(djpeg${suffix}-420m-ifast-ari
|
||||
${dir}djpeg${suffix} -fast -ppm -outfile testout_420m_ifast_ari.ppm
|
||||
${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
|
||||
add_test(cjpeg${suffix}-420m-ifast-ari-cmp
|
||||
add_test(djpeg${suffix}-420m-ifast-ari-cmp
|
||||
${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_IFAST_ARI}
|
||||
-DFILE=testout_420m_ifast_ari.ppm
|
||||
-P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
Version 1.0 (March 4, 2014)
|
||||
===========================
|
||||
|
||||
* Add 'jpgcrush' functionality. Figures out which progressive coding configuration uses the fewest bits.
|
||||
@@ -13,6 +13,25 @@ instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
|
||||
[3] Fixed an issue in tjBufSizeYUV2() whereby it would erroneously return 0
|
||||
instead of -1 if width was < 1.
|
||||
|
||||
[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
|
||||
ARM64 platforms (see 1.4 beta1 [5].)
|
||||
|
||||
[6] The close() method in the TJCompressor and TJDecompressor Java classes is
|
||||
now idempotent. Previously, that method would call the native tjDestroy()
|
||||
function even if the TurboJPEG instance had already been destroyed. This
|
||||
caused an exception to be thrown during finalization, if the close() method had
|
||||
already been called. The exception was caught, but it was still an expensive
|
||||
operation.
|
||||
|
||||
[7] The TurboJPEG API previously generated an error ("Could not determine
|
||||
subsampling type for JPEG image") when attempting to decompress grayscale JPEG
|
||||
images that were compressed with a sampling factor other than 1 (for instance,
|
||||
with 'cjpeg -grayscale -sample 2x2'). Subsampling technically has no meaning
|
||||
with grayscale JPEGs, and thus the horizontal and vertical sampling factors
|
||||
for such images are ignored by the decompressor. However, the TurboJPEG API
|
||||
was being too rigid and was expecting the sampling factors to be equal to 1
|
||||
before it treated the image as a grayscale JPEG.
|
||||
|
||||
[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
|
||||
print the library version and exit.
|
||||
|
||||
@@ -28,6 +47,26 @@ order), the Huffman encoder can produce encoded blocks that approach double the
|
||||
size of the unencoded blocks. Thus, the Huffman local buffer was increased to
|
||||
256 bytes, which should prevent any such issue from re-occurring in the future.
|
||||
|
||||
[10] The new tjPlaneSizeYUV(), tjPlaneWidth(), and tjPlaneHeight() functions
|
||||
were not actually usable on any platform except OS X and Windows, because
|
||||
those functions were not included in the libturbojpeg mapfile. This has been
|
||||
fixed.
|
||||
|
||||
[11] Restored the JPP(), JMETHOD(), and FAR macros in the libjpeg-turbo header
|
||||
files. The JPP() and JMETHOD() macros were originally implemented in libjpeg
|
||||
as a way of supporting non-ANSI compilers that lacked support for prototype
|
||||
parameters. libjpeg-turbo has never supported such compilers, but some
|
||||
software packages still use the macros to define their own prototypes.
|
||||
Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
|
||||
have far symbols, but some software packages still use the FAR macro. A pretty
|
||||
good argument can be made that this is a bad practice on the part of the
|
||||
software in question, but since this affects more than one package, it's just
|
||||
easier to fix it here.
|
||||
|
||||
[12] Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
|
||||
for iOS, and included an ARMv8 architecture in all of the binaries installed by
|
||||
the "official" libjpeg-turbo SDK for OS X.
|
||||
|
||||
|
||||
1.3.90 (1.4 beta1)
|
||||
==================
|
||||
@@ -280,7 +319,7 @@ configure/CMake switch in order to retain strict API/ABI compatibility with the
|
||||
libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.) See
|
||||
README-turbo.txt for more details.
|
||||
|
||||
[13] Added ARM v7s architecture to libjpeg.a and libturbojpeg.a in the official
|
||||
[13] Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
|
||||
libjpeg-turbo binary package for OS X, so that those libraries can be used to
|
||||
build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
|
||||
|
||||
@@ -363,7 +402,7 @@ K component is assigned a component ID of 1 instead of 4. Although these files
|
||||
are in violation of the spec, other JPEG implementations handle them
|
||||
correctly.
|
||||
|
||||
[7] Added ARM v6 and ARM v7 architectures to libjpeg.a and libturbojpeg.a in
|
||||
[7] Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
|
||||
the official libjpeg-turbo binary package for OS X, so that those libraries can
|
||||
be used to build both OS X and iOS applications.
|
||||
|
||||
|
||||
11
Makefile.am
11
Makefile.am
@@ -276,7 +276,10 @@ MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
|
||||
|
||||
endif
|
||||
|
||||
test: testclean all
|
||||
.PHONY: test
|
||||
test: tjquicktest bittest
|
||||
|
||||
tjquicktest: testclean all
|
||||
|
||||
if WITH_TURBOJPEG
|
||||
if WITH_JAVA
|
||||
@@ -294,6 +297,8 @@ endif
|
||||
./tjunittest -yuv -noyuvpad
|
||||
endif
|
||||
|
||||
bittest: testclean all
|
||||
|
||||
# These tests are carefully crafted to provide full coverage of as many of the
|
||||
# underlying algorithms as possible (including all of the SIMD-accelerated
|
||||
# ones.)
|
||||
@@ -598,12 +603,12 @@ udmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
|
||||
|
||||
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S}
|
||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||
|
||||
else
|
||||
|
||||
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||
sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S}
|
||||
sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||
|
||||
endif
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ The idea is to reduce transfer times for JPEGs on the Web, thus reducing page lo
|
||||
|
||||
More information:
|
||||
|
||||
* [Version 1.0 Announcement](https://blog.mozilla.org/research/2014/03/05/introducing-the-mozjpeg-project/)
|
||||
* [Version 2.0 Announcement](https://blog.mozilla.org/research/2014/07/15/mozilla-advances-jpeg-encoding-with-mozjpeg-2-0/)
|
||||
* [Mailing List](https://lists.mozilla.org/listinfo/dev-mozjpeg)
|
||||
* [Version 3.0 Announcement](https://boomswaggerboom.wordpress.com/2014/12/30/mozjpeg-3-0-released/)
|
||||
* [Version 2.0 Announcement](https://blog.mozilla.org/research/2014/07/15/mozilla-advances-jpeg-encoding-with-mozjpeg-2-0/)
|
||||
* [Version 1.0 Announcement](https://blog.mozilla.org/research/2014/03/05/introducing-the-mozjpeg-project/)
|
||||
|
||||
@@ -74,6 +74,7 @@ JMESSAGE(JWRN_GIF_NOMOREDATA, "Ran out of GIF bits")
|
||||
#ifdef PPM_SUPPORTED
|
||||
JMESSAGE(JERR_PPM_COLORSPACE, "PPM output must be grayscale or RGB")
|
||||
JMESSAGE(JERR_PPM_NONNUMERIC, "Nonnumeric data in PPM file")
|
||||
JMESSAGE(JERR_PPM_TOOLARGE, "Integer value too large in PPM file")
|
||||
JMESSAGE(JERR_PPM_NOT, "Not a PPM/PGM file")
|
||||
JMESSAGE(JTRC_PGM, "%ux%u PGM image")
|
||||
JMESSAGE(JTRC_PGM_TEXT, "%ux%u text PGM image")
|
||||
|
||||
9
cjpeg.c
9
cjpeg.c
@@ -207,6 +207,7 @@ usage (void)
|
||||
fprintf(stderr, " -dct float Use floating-point DCT method%s\n",
|
||||
(JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
|
||||
#endif
|
||||
fprintf(stderr, " -quant-baseline Use 8-bit quantization table entries for baseline JPEG compatibility\n");
|
||||
fprintf(stderr, " -quant-table N Use predefined quantization table N:\n");
|
||||
fprintf(stderr, " - 0 JPEG Annex K\n");
|
||||
fprintf(stderr, " - 1 Flat\n");
|
||||
@@ -450,7 +451,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
|
||||
qtablefile = argv[argn];
|
||||
/* We postpone actually reading the file in case -quality comes later. */
|
||||
|
||||
} else if (keymatch(arg, "quant-table", 2)) {
|
||||
} else if (keymatch(arg, "quant-table", 7)) {
|
||||
int val;
|
||||
if (++argn >= argc) /* advance to next argument */
|
||||
usage();
|
||||
@@ -461,7 +462,11 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
|
||||
usage();
|
||||
}
|
||||
jpeg_set_quality(cinfo, 75, TRUE);
|
||||
|
||||
|
||||
} else if (keymatch(arg, "quant-baseline", 7)) {
|
||||
/* Force quantization table to meet baseline requirements */
|
||||
force_baseline = TRUE;
|
||||
|
||||
} else if (keymatch(arg, "restart", 1)) {
|
||||
/* Restart interval in MCU rows (or in MCUs with 'b'). */
|
||||
long lval;
|
||||
|
||||
14
configure.ac
14
configure.ac
@@ -2,7 +2,7 @@
|
||||
# Process this file with autoconf to produce a configure script.
|
||||
|
||||
AC_PREREQ([2.56])
|
||||
AC_INIT([mozjpeg], [3.0])
|
||||
AC_INIT([mozjpeg], [3.1])
|
||||
BUILD=`date +%Y%m%d`
|
||||
|
||||
AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
|
||||
@@ -443,7 +443,11 @@ if test "x${with_simd}" != "xno"; then
|
||||
AC_MSG_RESULT([yes (arm)])
|
||||
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
||||
AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
|
||||
[AC_MSG_RESULT([yes])
|
||||
[if test "x$ac_use_gas_preprocessor" = "xyes"; then
|
||||
AC_MSG_RESULT([yes (with gas-preprocessor)])
|
||||
else
|
||||
AC_MSG_RESULT([yes])
|
||||
fi
|
||||
simd_arch=arm],
|
||||
[AC_MSG_RESULT([no])
|
||||
with_simd=no])
|
||||
@@ -459,7 +463,11 @@ if test "x${with_simd}" != "xno"; then
|
||||
AC_MSG_RESULT([yes (arm64)])
|
||||
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
||||
AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
|
||||
[AC_MSG_RESULT([yes])
|
||||
[if test "x$ac_use_gas_preprocessor" = "xyes"; then
|
||||
AC_MSG_RESULT([yes (with gas-preprocessor)])
|
||||
else
|
||||
AC_MSG_RESULT([yes])
|
||||
fi
|
||||
simd_arch=aarch64],
|
||||
[AC_MSG_RESULT([no])
|
||||
with_simd=no])
|
||||
|
||||
@@ -567,7 +567,8 @@ public class TJCompressor {
|
||||
* Free the native structures associated with this compressor instance.
|
||||
*/
|
||||
public void close() throws Exception {
|
||||
destroy();
|
||||
if (handle != 0)
|
||||
destroy();
|
||||
}
|
||||
|
||||
protected void finalize() throws Throwable {
|
||||
|
||||
@@ -834,7 +834,8 @@ public class TJDecompressor {
|
||||
* Free the native structures associated with this decompressor instance.
|
||||
*/
|
||||
public void close() throws Exception {
|
||||
destroy();
|
||||
if (handle != 0)
|
||||
destroy();
|
||||
}
|
||||
|
||||
protected void finalize() throws Throwable {
|
||||
|
||||
@@ -940,6 +940,11 @@ jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rate
|
||||
{
|
||||
int i;
|
||||
arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
|
||||
|
||||
r->arith_dc_L = cinfo->arith_dc_L[dc_tbl_no];
|
||||
r->arith_dc_U = cinfo->arith_dc_U[dc_tbl_no];
|
||||
r->arith_ac_K = cinfo->arith_ac_K[ac_tbl_no];
|
||||
|
||||
for (i = 0; i < DC_STAT_BINS; i++) {
|
||||
int state = entropy->dc_stats[dc_tbl_no][i];
|
||||
int mps_val = state >> 7;
|
||||
|
||||
11
jccoefct.c
11
jccoefct.c
@@ -367,18 +367,23 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
|
||||
c_derived_tbl actbl_data;
|
||||
c_derived_tbl *actbl = &actbl_data;
|
||||
|
||||
#ifdef C_ARITH_CODING_SUPPORTED
|
||||
arith_rates arith_r_data;
|
||||
arith_rates *arith_r = &arith_r_data;
|
||||
#endif
|
||||
|
||||
compptr = cinfo->cur_comp_info[ci];
|
||||
|
||||
#ifdef C_ARITH_CODING_SUPPORTED
|
||||
if (cinfo->arith_code)
|
||||
jget_arith_rates(cinfo, compptr->dc_tbl_no, compptr->ac_tbl_no, arith_r);
|
||||
else {
|
||||
else
|
||||
#endif
|
||||
{
|
||||
jpeg_make_c_derived_tbl(cinfo, TRUE, compptr->dc_tbl_no, &dctbl);
|
||||
jpeg_make_c_derived_tbl(cinfo, FALSE, compptr->ac_tbl_no, &actbl);
|
||||
}
|
||||
|
||||
|
||||
/* Align the virtual buffer for this component. */
|
||||
buffer = (*cinfo->mem->access_virt_barray)
|
||||
((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
|
||||
@@ -413,6 +418,7 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
|
||||
for (block_row = 0; block_row < block_rows; block_row++) {
|
||||
thisblockrow = buffer[block_row];
|
||||
lastblockrow = (block_row > 0) ? buffer[block_row-1] : NULL;
|
||||
#ifdef C_ARITH_CODING_SUPPORTED
|
||||
if (cinfo->arith_code)
|
||||
quantize_trellis_arith(cinfo, arith_r, thisblockrow,
|
||||
buffer_dst[block_row], blocks_across,
|
||||
@@ -421,6 +427,7 @@ compress_trellis_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
|
||||
cinfo->master->norm_coef[compptr->quant_tbl_no],
|
||||
&lastDC, lastblockrow, buffer_dst[block_row-1]);
|
||||
else
|
||||
#endif
|
||||
quantize_trellis(cinfo, dctbl, actbl, thisblockrow,
|
||||
buffer_dst[block_row], blocks_across,
|
||||
cinfo->quant_tbl_ptrs[compptr->quant_tbl_no],
|
||||
|
||||
164
jcdctmgr.c
164
jcdctmgr.c
@@ -22,6 +22,7 @@
|
||||
#include "jpeglib.h"
|
||||
#include "jdct.h" /* Private declarations for DCT subsystem */
|
||||
#include "jsimddct.h"
|
||||
#include "jchuff.h"
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
@@ -726,6 +727,17 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
||||
|
||||
/* Quantize/descale the coefficients, and store into coef_blocks[] */
|
||||
(*do_quantize) (coef_blocks[bi], divisors, workspace);
|
||||
|
||||
if (do_preprocess) {
|
||||
int i;
|
||||
int maxval = (1 << MAX_COEF_BITS) - 1;
|
||||
for (i = 0; i < 64; i++) {
|
||||
if (coef_blocks[bi][i] < -maxval)
|
||||
coef_blocks[bi][i] = -maxval;
|
||||
if (coef_blocks[bi][i] > maxval)
|
||||
coef_blocks[bi][i] = maxval;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -845,12 +857,22 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
||||
|
||||
/* Quantize/descale the coefficients, and store into coef_blocks[] */
|
||||
(*do_quantize) (coef_blocks[bi], divisors, workspace);
|
||||
|
||||
if (do_preprocess) {
|
||||
int i;
|
||||
int maxval = (1 << MAX_COEF_BITS) - 1;
|
||||
for (i = 0; i < 64; i++) {
|
||||
if (coef_blocks[bi][i] < -maxval)
|
||||
coef_blocks[bi][i] = -maxval;
|
||||
if (coef_blocks[bi][i] > maxval)
|
||||
coef_blocks[bi][i] = maxval;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* DCT_FLOAT_SUPPORTED */
|
||||
|
||||
#include "jchuff.h"
|
||||
#include "jpeg_nbits_table.h"
|
||||
|
||||
static const float jpeg_lambda_weights_flat[64] = {
|
||||
@@ -875,7 +897,12 @@ static const float jpeg_lambda_weights_csf_luma[64] = {
|
||||
0.43454f, 0.42146f, 0.34609f, 0.24072f, 0.15975f, 0.10701f, 0.07558f, 0.05875f,
|
||||
};
|
||||
|
||||
#define DC_TRELLIS_CANDIDATES 3
|
||||
#define DC_TRELLIS_MAX_CANDIDATES 9
|
||||
|
||||
LOCAL(int) get_num_dc_trellis_candidates(int dc_quantval) {
|
||||
/* Higher qualities can tolerate higher DC distortion */
|
||||
return MIN(DC_TRELLIS_MAX_CANDIDATES, (2 + 60 / dc_quantval)|1);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
|
||||
@@ -908,12 +935,13 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
int zero_run;
|
||||
int run_bits;
|
||||
int rate;
|
||||
float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
|
||||
int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
|
||||
JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
|
||||
float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
|
||||
int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
|
||||
JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
|
||||
int mode = 1;
|
||||
float lambda_table[DCTSIZE2];
|
||||
|
||||
const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);
|
||||
|
||||
Ss = cinfo->Ss;
|
||||
Se = cinfo->Se;
|
||||
if (Ss == 0)
|
||||
@@ -936,8 +964,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
accumulated_block_cost[0] = 0;
|
||||
requires_eob[0] = 0;
|
||||
}
|
||||
|
||||
if (cinfo->master->trellis_quant_dc) {
|
||||
for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
|
||||
for (i = 0; i < dc_trellis_candidates; i++) {
|
||||
accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
|
||||
dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
|
||||
dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
|
||||
@@ -991,12 +1020,17 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
float dc_candidate_dist;
|
||||
|
||||
qval = (x + q/2) / q; /* quantized value (round nearest) */
|
||||
for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
|
||||
for (k = 0; k < dc_trellis_candidates; k++) {
|
||||
int delta;
|
||||
int dc_delta;
|
||||
int bits;
|
||||
|
||||
dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
|
||||
dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
|
||||
if (dc_candidate[k][bi] >= (1<<MAX_COEF_BITS))
|
||||
dc_candidate[k][bi] = (1<<MAX_COEF_BITS)-1;
|
||||
if (dc_candidate[k][bi] <= -(1<<MAX_COEF_BITS))
|
||||
dc_candidate[k][bi] = -(1<<MAX_COEF_BITS)+1;
|
||||
|
||||
delta = dc_candidate[k][bi] * q - x;
|
||||
dc_candidate_dist = delta * delta * lambda_dc;
|
||||
dc_candidate[k][bi] *= 1 + 2*sign;
|
||||
@@ -1033,7 +1067,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
accumulated_dc_cost[k][0] = cost;
|
||||
dc_cost_backtrack[k][0] = -1;
|
||||
} else {
|
||||
for (l = 0; l < DC_TRELLIS_CANDIDATES; l++) {
|
||||
for (l = 0; l < dc_trellis_candidates; l++) {
|
||||
dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
|
||||
|
||||
/* Derive number of suffix bits */
|
||||
@@ -1076,6 +1110,9 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
continue;
|
||||
}
|
||||
|
||||
if (qval >= (1<<MAX_COEF_BITS))
|
||||
qval = (1<<MAX_COEF_BITS)-1;
|
||||
|
||||
num_candidates = jpeg_nbits_table[qval];
|
||||
for (k = 0; k < num_candidates; k++) {
|
||||
int delta;
|
||||
@@ -1240,7 +1277,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
|
||||
if (cinfo->master->trellis_quant_dc) {
|
||||
j = 0;
|
||||
for (i = 1; i < DC_TRELLIS_CANDIDATES; i++) {
|
||||
for (i = 1; i < dc_trellis_candidates; i++) {
|
||||
if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
|
||||
j = i;
|
||||
}
|
||||
@@ -1252,7 +1289,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
/* Save DC predictor */
|
||||
*last_dc_val = coef_blocks[num_blocks-1][0];
|
||||
|
||||
for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
|
||||
for (i = 0; i < dc_trellis_candidates; i++) {
|
||||
free(accumulated_dc_cost[i]);
|
||||
free(dc_cost_backtrack[i]);
|
||||
free(dc_candidate[i]);
|
||||
@@ -1261,6 +1298,7 @@ quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actb
|
||||
|
||||
}
|
||||
|
||||
#ifdef C_ARITH_CODING_SUPPORTED
|
||||
GLOBAL(void)
|
||||
quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
|
||||
JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
|
||||
@@ -1281,20 +1319,17 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
jpeg_lambda_weights_csf_luma :
|
||||
jpeg_lambda_weights_flat;
|
||||
int Ss, Se;
|
||||
int has_eob;
|
||||
float cost_all_zeros;
|
||||
float best_cost_skip;
|
||||
float cost;
|
||||
float run_bits;
|
||||
int rate;
|
||||
float *accumulated_dc_cost[DC_TRELLIS_CANDIDATES];
|
||||
int *dc_cost_backtrack[DC_TRELLIS_CANDIDATES];
|
||||
JCOEF *dc_candidate[DC_TRELLIS_CANDIDATES];
|
||||
float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
|
||||
int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
|
||||
JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
|
||||
int *dc_context[DC_TRELLIS_MAX_CANDIDATES];
|
||||
|
||||
int mode = 1;
|
||||
float lambda_table[DCTSIZE2];
|
||||
|
||||
/* Arithmetic coding context. Set to 0 for now but can refined */
|
||||
int dc_context = 0;
|
||||
const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);
|
||||
|
||||
Ss = cinfo->Ss;
|
||||
Se = cinfo->Se;
|
||||
@@ -1304,13 +1339,15 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
return;
|
||||
|
||||
if (cinfo->master->trellis_quant_dc) {
|
||||
for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
|
||||
for (i = 0; i < dc_trellis_candidates; i++) {
|
||||
accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
|
||||
dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
|
||||
dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
|
||||
dc_context[i] = (int *)malloc(num_blocks * sizeof(int));
|
||||
if (!accumulated_dc_cost[i] ||
|
||||
!dc_cost_backtrack[i] ||
|
||||
!dc_candidate[i]) {
|
||||
!dc_candidate[i] ||
|
||||
!dc_context[i]) {
|
||||
ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
|
||||
}
|
||||
}
|
||||
@@ -1358,15 +1395,16 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
float dc_candidate_dist;
|
||||
|
||||
qval = (x + q/2) / q; /* quantized value (round nearest) */
|
||||
for (k = 0; k < DC_TRELLIS_CANDIDATES; k++) {
|
||||
|
||||
/* loop over candidates in current block */
|
||||
for (k = 0; k < dc_trellis_candidates; k++) {
|
||||
int delta;
|
||||
int dc_delta;
|
||||
float bits;
|
||||
int st = dc_context;
|
||||
int m;
|
||||
int v2;
|
||||
|
||||
dc_candidate[k][bi] = qval - DC_TRELLIS_CANDIDATES/2 + k;
|
||||
dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
|
||||
delta = dc_candidate[k][bi] * q - x;
|
||||
dc_candidate_dist = delta * delta * lambda_dc;
|
||||
dc_candidate[k][bi] *= 1 + 2*sign;
|
||||
@@ -1389,13 +1427,20 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
dc_candidate_dist += cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
|
||||
}
|
||||
|
||||
if (bi == 0) {
|
||||
dc_delta = dc_candidate[k][bi] - *last_dc_val;
|
||||
/* loop of candidates from previous block */
|
||||
for (l = 0; l < (bi == 0 ? 1 : dc_trellis_candidates); l++) {
|
||||
int dc_pred = (bi == 0 ? *last_dc_val : dc_candidate[l][bi-1]);
|
||||
int updated_dc_context = 0;
|
||||
int st = (bi == 0) ? 0 : dc_context[l][bi-1];
|
||||
dc_delta = dc_candidate[k][bi] - dc_pred;
|
||||
|
||||
bits = r->rate_dc[st][dc_delta != 0];
|
||||
|
||||
if (dc_delta != 0) {
|
||||
bits += r->rate_dc[st+1][dc_delta < 0];
|
||||
st += 2 + (dc_delta < 0);
|
||||
updated_dc_context = (dc_delta < 0) ? 8 : 4;
|
||||
|
||||
dc_delta = abs(dc_delta);
|
||||
|
||||
m = 0;
|
||||
@@ -1411,46 +1456,25 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
}
|
||||
}
|
||||
bits += r->rate_dc[st][0];
|
||||
|
||||
if (m < (int) ((1L << r->arith_dc_L) >> 1))
|
||||
updated_dc_context = 0; /* zero diff category */
|
||||
else if (m > (int) ((1L << r->arith_dc_U) >> 1))
|
||||
updated_dc_context += 8; /* large diff category */
|
||||
|
||||
st += 14;
|
||||
while (m >>= 1)
|
||||
bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
|
||||
}
|
||||
|
||||
cost = bits + dc_candidate_dist;
|
||||
accumulated_dc_cost[k][0] = cost;
|
||||
dc_cost_backtrack[k][0] = -1;
|
||||
} else {
|
||||
for (l = 0; l < DC_TRELLIS_CANDIDATES; l++) {
|
||||
dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
|
||||
|
||||
bits = r->rate_dc[st][dc_delta != 0];
|
||||
if (dc_delta != 0) {
|
||||
bits += r->rate_dc[st+1][dc_delta < 0];
|
||||
st += 2 + (dc_delta < 0);
|
||||
dc_delta = abs(dc_delta);
|
||||
|
||||
m = 0;
|
||||
if (dc_delta -= 1) {
|
||||
bits += r->rate_dc[st][1];
|
||||
st = 20;
|
||||
m = 1;
|
||||
v2 = dc_delta;
|
||||
while (v2 >>= 1) {
|
||||
bits += r->rate_dc[st][1];
|
||||
m <<= 1;
|
||||
st++;
|
||||
}
|
||||
}
|
||||
bits += r->rate_dc[st][0];
|
||||
st += 14;
|
||||
while (m >>= 1)
|
||||
bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
|
||||
}
|
||||
|
||||
cost = bits + dc_candidate_dist + accumulated_dc_cost[l][bi-1];
|
||||
if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
|
||||
accumulated_dc_cost[k][bi] = cost;
|
||||
dc_cost_backtrack[k][bi] = l;
|
||||
}
|
||||
if (bi != 0)
|
||||
cost += accumulated_dc_cost[l][bi-1];
|
||||
|
||||
if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
|
||||
accumulated_dc_cost[k][bi] = cost;
|
||||
dc_cost_backtrack[k][bi] = (bi == 0 ? -1 : l);
|
||||
dc_context[k][bi] = updated_dc_context;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1520,7 +1544,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
if (v2 >>= 1) {
|
||||
coef_bits += r->rate_ac[st][1];
|
||||
m <<= 1;
|
||||
st = 189; /* TODO: condition 189/217 */
|
||||
st = (i <= r->arith_ac_K) ? 189 : 217;
|
||||
while (v2 >>= 1) {
|
||||
coef_bits += r->rate_ac[st][1];
|
||||
m <<= 1;
|
||||
@@ -1548,14 +1572,11 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
|
||||
last_coeff_idx = Ss-1;
|
||||
best_cost = accumulated_zero_dist[Se] + r->rate_ac[0][1];
|
||||
cost_all_zeros = accumulated_zero_dist[Se];
|
||||
best_cost_skip = cost_all_zeros;
|
||||
|
||||
for (i = Ss; i <= Se; i++) {
|
||||
int z = jpeg_natural_order[i];
|
||||
if (coef_blocks[bi][z] != 0) {
|
||||
float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
|
||||
float cost_wo_eob = cost;
|
||||
|
||||
if (i < Se)
|
||||
cost += r->rate_ac[3*(i-1)][1];
|
||||
@@ -1563,13 +1584,10 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
if (cost < best_cost) {
|
||||
best_cost = cost;
|
||||
last_coeff_idx = i;
|
||||
best_cost_skip = cost_wo_eob;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
has_eob = (last_coeff_idx < Se) + (last_coeff_idx == Ss-1);
|
||||
|
||||
/* Zero out coefficients that are part of runs */
|
||||
i = Se;
|
||||
while (i >= Ss)
|
||||
@@ -1596,7 +1614,7 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
|
||||
if (cinfo->master->trellis_quant_dc) {
|
||||
j = 0;
|
||||
for (i = 1; i < DC_TRELLIS_CANDIDATES; i++) {
|
||||
for (i = 1; i < dc_trellis_candidates; i++) {
|
||||
if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
|
||||
j = i;
|
||||
}
|
||||
@@ -1608,13 +1626,15 @@ quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_bloc
|
||||
/* Save DC predictor */
|
||||
*last_dc_val = coef_blocks[num_blocks-1][0];
|
||||
|
||||
for (i = 0; i < DC_TRELLIS_CANDIDATES; i++) {
|
||||
for (i = 0; i < dc_trellis_candidates; i++) {
|
||||
free(accumulated_dc_cost[i]);
|
||||
free(dc_cost_backtrack[i]);
|
||||
free(dc_candidate[i]);
|
||||
free(dc_context[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Initialize FDCT manager.
|
||||
|
||||
12
jcext.c
12
jcext.c
@@ -16,7 +16,7 @@
|
||||
|
||||
|
||||
GLOBAL(boolean)
|
||||
jpeg_c_bool_param_supported (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
|
||||
jpeg_c_bool_param_supported (const j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
|
||||
{
|
||||
switch (param) {
|
||||
case JBOOLEAN_OPTIMIZE_SCANS:
|
||||
@@ -70,7 +70,7 @@ jpeg_c_set_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param,
|
||||
|
||||
|
||||
GLOBAL(boolean)
|
||||
jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
|
||||
jpeg_c_get_bool_param (const j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
|
||||
{
|
||||
switch(param) {
|
||||
case JBOOLEAN_OPTIMIZE_SCANS:
|
||||
@@ -98,7 +98,7 @@ jpeg_c_get_bool_param (j_compress_ptr cinfo, J_BOOLEAN_PARAM param)
|
||||
|
||||
|
||||
GLOBAL(boolean)
|
||||
jpeg_c_float_param_supported (j_compress_ptr cinfo, J_FLOAT_PARAM param)
|
||||
jpeg_c_float_param_supported (const j_compress_ptr cinfo, J_FLOAT_PARAM param)
|
||||
{
|
||||
switch (param) {
|
||||
case JFLOAT_LAMBDA_LOG_SCALE1:
|
||||
@@ -131,7 +131,7 @@ jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param, float value)
|
||||
|
||||
|
||||
GLOBAL(float)
|
||||
jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param)
|
||||
jpeg_c_get_float_param (const j_compress_ptr cinfo, J_FLOAT_PARAM param)
|
||||
{
|
||||
switch (param) {
|
||||
case JFLOAT_LAMBDA_LOG_SCALE1:
|
||||
@@ -149,7 +149,7 @@ jpeg_c_get_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param)
|
||||
|
||||
|
||||
GLOBAL(boolean)
|
||||
jpeg_c_int_param_supported (j_compress_ptr cinfo, J_INT_PARAM param)
|
||||
jpeg_c_int_param_supported (const j_compress_ptr cinfo, J_INT_PARAM param)
|
||||
{
|
||||
switch (param) {
|
||||
case JINT_COMPRESS_PROFILE:
|
||||
@@ -198,7 +198,7 @@ jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param, int value)
|
||||
|
||||
|
||||
GLOBAL(int)
|
||||
jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param)
|
||||
jpeg_c_get_int_param (const j_compress_ptr cinfo, J_INT_PARAM param)
|
||||
{
|
||||
switch (param) {
|
||||
case JINT_COMPRESS_PROFILE:
|
||||
|
||||
2
jchuff.c
2
jchuff.c
@@ -37,7 +37,7 @@
|
||||
*/
|
||||
|
||||
/* NOTE: Both GCC and Clang define __GNUC__ */
|
||||
#if defined __GNUC__ && defined __arm__
|
||||
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||
#if !defined __thumb__ || defined __thumb2__
|
||||
#define USE_CLZ_INTRINSIC
|
||||
#endif
|
||||
|
||||
142
jcmainct.c
142
jcmainct.c
@@ -17,14 +17,6 @@
|
||||
#include "jpeglib.h"
|
||||
|
||||
|
||||
/* Note: currently, there is no operating mode in which a full-image buffer
|
||||
* is needed at this step. If there were, that mode could not be used with
|
||||
* "raw data" input, since this module is bypassed in that case. However,
|
||||
* we've left the code here for possible use in special applications.
|
||||
*/
|
||||
#undef FULL_MAIN_BUFFER_SUPPORTED
|
||||
|
||||
|
||||
/* Private buffer controller object */
|
||||
|
||||
typedef struct {
|
||||
@@ -40,13 +32,6 @@ typedef struct {
|
||||
* points to the currently accessible strips of the virtual arrays.
|
||||
*/
|
||||
JSAMPARRAY buffer[MAX_COMPONENTS];
|
||||
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
/* If using full-image storage, this array holds pointers to virtual-array
|
||||
* control blocks for each component. Unused if not full-image storage.
|
||||
*/
|
||||
jvirt_sarray_ptr whole_image[MAX_COMPONENTS];
|
||||
#endif
|
||||
} my_main_controller;
|
||||
|
||||
typedef my_main_controller * my_main_ptr;
|
||||
@@ -56,11 +41,6 @@ typedef my_main_controller * my_main_ptr;
|
||||
METHODDEF(void) process_data_simple_main
|
||||
(j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
||||
JDIMENSION in_rows_avail);
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
METHODDEF(void) process_data_buffer_main
|
||||
(j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
||||
JDIMENSION in_rows_avail);
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
@@ -76,32 +56,14 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
|
||||
if (cinfo->raw_data_in)
|
||||
return;
|
||||
|
||||
if (pass_mode != JBUF_PASS_THRU)
|
||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||
|
||||
main_ptr->cur_iMCU_row = 0; /* initialize counters */
|
||||
main_ptr->rowgroup_ctr = 0;
|
||||
main_ptr->suspended = FALSE;
|
||||
main_ptr->pass_mode = pass_mode; /* save mode for use by process_data */
|
||||
|
||||
switch (pass_mode) {
|
||||
case JBUF_PASS_THRU:
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
if (main_ptr->whole_image[0] != NULL)
|
||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||
#endif
|
||||
main_ptr->pub.process_data = process_data_simple_main;
|
||||
break;
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
case JBUF_SAVE_SOURCE:
|
||||
case JBUF_CRANK_DEST:
|
||||
case JBUF_SAVE_AND_PASS:
|
||||
if (main_ptr->whole_image[0] == NULL)
|
||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||
main_ptr->pub.process_data = process_data_buffer_main;
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||
break;
|
||||
}
|
||||
main_ptr->pub.process_data = process_data_simple_main;
|
||||
}
|
||||
|
||||
|
||||
@@ -160,85 +122,6 @@ process_data_simple_main (j_compress_ptr cinfo,
|
||||
}
|
||||
|
||||
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
|
||||
/*
|
||||
* Process some data.
|
||||
* This routine handles all of the modes that use a full-size buffer.
|
||||
*/
|
||||
|
||||
METHODDEF(void)
|
||||
process_data_buffer_main (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
||||
JDIMENSION in_rows_avail)
|
||||
{
|
||||
my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
|
||||
int ci;
|
||||
jpeg_component_info *compptr;
|
||||
boolean writing = (main_ptr->pass_mode != JBUF_CRANK_DEST);
|
||||
|
||||
while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
|
||||
/* Realign the virtual buffers if at the start of an iMCU row. */
|
||||
if (main_ptr->rowgroup_ctr == 0) {
|
||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
||||
ci++, compptr++) {
|
||||
main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray)
|
||||
((j_common_ptr) cinfo, main_ptr->whole_image[ci],
|
||||
main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
|
||||
(JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
|
||||
}
|
||||
/* In a read pass, pretend we just read some source data. */
|
||||
if (! writing) {
|
||||
*in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
|
||||
main_ptr->rowgroup_ctr = DCTSIZE;
|
||||
}
|
||||
}
|
||||
|
||||
/* If a write pass, read input data until the current iMCU row is full. */
|
||||
/* Note: preprocessor will pad if necessary to fill the last iMCU row. */
|
||||
if (writing) {
|
||||
(*cinfo->prep->pre_process_data) (cinfo,
|
||||
input_buf, in_row_ctr, in_rows_avail,
|
||||
main_ptr->buffer, &main_ptr->rowgroup_ctr,
|
||||
(JDIMENSION) DCTSIZE);
|
||||
/* Return to application if we need more data to fill the iMCU row. */
|
||||
if (main_ptr->rowgroup_ctr < DCTSIZE)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Emit data, unless this is a sink-only pass. */
|
||||
if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) {
|
||||
if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
|
||||
/* If compressor did not consume the whole row, then we must need to
|
||||
* suspend processing and return to the application. In this situation
|
||||
* we pretend we didn't yet consume the last input row; otherwise, if
|
||||
* it happened to be the last row of the image, the application would
|
||||
* think we were done.
|
||||
*/
|
||||
if (! main_ptr->suspended) {
|
||||
(*in_row_ctr)--;
|
||||
main_ptr->suspended = TRUE;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* We did finish the row. Undo our little suspension hack if a previous
|
||||
* call suspended; then mark the main buffer empty.
|
||||
*/
|
||||
if (main_ptr->suspended) {
|
||||
(*in_row_ctr)++;
|
||||
main_ptr->suspended = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
/* If get here, we are done with this iMCU row. Mark buffer empty. */
|
||||
main_ptr->rowgroup_ctr = 0;
|
||||
main_ptr->cur_iMCU_row++;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* FULL_MAIN_BUFFER_SUPPORTED */
|
||||
|
||||
|
||||
/*
|
||||
* Initialize main buffer controller.
|
||||
*/
|
||||
@@ -264,25 +147,8 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
|
||||
* may be of a different size.
|
||||
*/
|
||||
if (need_full_buffer) {
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
/* Allocate a full-image virtual array for each component */
|
||||
/* Note we pad the bottom to a multiple of the iMCU height */
|
||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
||||
ci++, compptr++) {
|
||||
main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
|
||||
((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
|
||||
compptr->width_in_blocks * DCTSIZE,
|
||||
(JDIMENSION) jround_up((long) compptr->height_in_blocks,
|
||||
(long) compptr->v_samp_factor) * DCTSIZE,
|
||||
(JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
|
||||
}
|
||||
#else
|
||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||
#endif
|
||||
} else {
|
||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
||||
main_ptr->whole_image[0] = NULL; /* flag for no virtual arrays */
|
||||
#endif
|
||||
/* Allocate a strip buffer for each component */
|
||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
||||
ci++, compptr++) {
|
||||
|
||||
@@ -605,7 +605,9 @@ copy_buffer (j_compress_ptr cinfo, int scan_idx)
|
||||
size -= cinfo->dest->free_in_buffer;
|
||||
cinfo->dest->next_output_byte += cinfo->dest->free_in_buffer;
|
||||
cinfo->dest->free_in_buffer = 0;
|
||||
(*cinfo->dest->empty_output_buffer)(cinfo);
|
||||
|
||||
if (!(*cinfo->dest->empty_output_buffer)(cinfo))
|
||||
ERREXIT(cinfo, JERR_UNSUPPORTED_SUSPEND);
|
||||
}
|
||||
|
||||
MEMCOPY(cinfo->dest->next_output_byte, src, size);
|
||||
|
||||
@@ -514,8 +514,9 @@ jinit_downsampler (j_compress_ptr cinfo)
|
||||
#endif
|
||||
downsample->methods[ci] = h2v2_smooth_downsample;
|
||||
downsample->pub.need_context_rows = TRUE;
|
||||
} else {
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (jsimd_can_h2v2_downsample())
|
||||
downsample->methods[ci] = jsimd_h2v2_downsample;
|
||||
else
|
||||
|
||||
@@ -69,7 +69,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)
|
||||
*/
|
||||
|
||||
GLOBAL(void)
|
||||
jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
|
||||
jpeg_copy_critical_parameters (const j_decompress_ptr srcinfo,
|
||||
j_compress_ptr dstinfo)
|
||||
{
|
||||
JQUANT_TBL ** qtblptr;
|
||||
|
||||
@@ -333,7 +333,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
|
||||
*/
|
||||
|
||||
GLOBAL(boolean)
|
||||
jpeg_input_complete (j_decompress_ptr cinfo)
|
||||
jpeg_input_complete (const j_decompress_ptr cinfo)
|
||||
{
|
||||
/* Check for valid jpeg object */
|
||||
if (cinfo->global_state < DSTATE_START ||
|
||||
@@ -348,7 +348,7 @@ jpeg_input_complete (j_decompress_ptr cinfo)
|
||||
*/
|
||||
|
||||
GLOBAL(boolean)
|
||||
jpeg_has_multiple_scans (j_decompress_ptr cinfo)
|
||||
jpeg_has_multiple_scans (const j_decompress_ptr cinfo)
|
||||
{
|
||||
/* Only valid after jpeg_read_header completes */
|
||||
if (cinfo->global_state < DSTATE_READY ||
|
||||
|
||||
@@ -254,7 +254,7 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
|
||||
|
||||
GLOBAL(void)
|
||||
jpeg_mem_src (j_decompress_ptr cinfo,
|
||||
unsigned char * inbuffer, unsigned long insize)
|
||||
const unsigned char * inbuffer, unsigned long insize)
|
||||
{
|
||||
struct jpeg_source_mgr * src;
|
||||
|
||||
@@ -278,6 +278,6 @@ jpeg_mem_src (j_decompress_ptr cinfo,
|
||||
src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
|
||||
src->term_source = term_source;
|
||||
src->bytes_in_buffer = (size_t) insize;
|
||||
src->next_input_byte = (JOCTET *) inbuffer;
|
||||
src->next_input_byte = (const JOCTET *) inbuffer;
|
||||
}
|
||||
#endif
|
||||
|
||||
2
jerror.h
2
jerror.h
@@ -210,6 +210,8 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
|
||||
JMESSAGE(JERR_BAD_PARAM, "Bogus parameter")
|
||||
JMESSAGE(JERR_BAD_PARAM_VALUE, "Bogus parameter value")
|
||||
|
||||
JMESSAGE(JERR_UNSUPPORTED_SUSPEND, "I/O suspension not supported in scan optimization")
|
||||
|
||||
#ifdef JMAKE_ENUM_LIST
|
||||
|
||||
JMSG_LASTMSGCODE
|
||||
|
||||
18
jmorecfg.h
18
jmorecfg.h
@@ -180,6 +180,24 @@ typedef unsigned int JDIMENSION;
|
||||
#define EXTERN(type) extern type
|
||||
|
||||
|
||||
/* Originally, this macro was used as a way of defining function prototypes
|
||||
* for both modern compilers as well as older compilers that did not support
|
||||
* prototype parameters. libjpeg-turbo has never supported these older,
|
||||
* non-ANSI compilers, but the macro is still included because there is some
|
||||
* software out there that uses it.
|
||||
*/
|
||||
|
||||
#define JMETHOD(type,methodname,arglist) type (*methodname) arglist
|
||||
|
||||
|
||||
/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
|
||||
* but again, some software relies on this macro.
|
||||
*/
|
||||
|
||||
#undef FAR
|
||||
#define FAR
|
||||
|
||||
|
||||
/*
|
||||
* On a few systems, type boolean and/or its values FALSE, TRUE may appear
|
||||
* in standard header files. Or you may have conflicts with application-
|
||||
|
||||
@@ -91,6 +91,7 @@ struct jpeg_comp_master {
|
||||
float trellis_delta_dc_weight;
|
||||
};
|
||||
|
||||
#ifdef C_ARITH_CODING_SUPPORTED
|
||||
/* The following two definitions specify the allocation chunk size
|
||||
* for the statistics area.
|
||||
* According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
|
||||
@@ -110,7 +111,11 @@ struct jpeg_comp_master {
|
||||
typedef struct {
|
||||
float rate_dc[DC_STAT_BINS][2];
|
||||
float rate_ac[AC_STAT_BINS][2];
|
||||
int arith_dc_L;
|
||||
int arith_dc_U;
|
||||
int arith_ac_K;
|
||||
} arith_rates;
|
||||
#endif
|
||||
|
||||
/* Main buffer control (downsampled-data buffer) */
|
||||
struct jpeg_c_main_controller {
|
||||
@@ -386,12 +391,14 @@ EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
|
||||
JDIMENSION num_blocks);
|
||||
EXTERN(void) jzero_far (void * target, size_t bytestozero);
|
||||
|
||||
#ifdef C_ARITH_CODING_SUPPORTED
|
||||
EXTERN(void) jget_arith_rates (j_compress_ptr cinfo, int dc_tbl_no, int ac_tbl_no, arith_rates *r);
|
||||
|
||||
EXTERN(void) quantize_trellis_arith
|
||||
(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
|
||||
JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
|
||||
JBLOCKROW coef_blocks_above, JBLOCKROW src_above);
|
||||
#endif
|
||||
|
||||
/* Constant tables in jutils.c */
|
||||
#if 0 /* This table is not actually needed in v6a */
|
||||
|
||||
30
jpeglib.h
30
jpeglib.h
@@ -923,6 +923,16 @@ struct jpeg_memory_mgr {
|
||||
typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
|
||||
|
||||
|
||||
/* Originally, this macro was used as a way of defining function prototypes
|
||||
* for both modern compilers as well as older compilers that did not support
|
||||
* prototype parameters. libjpeg-turbo has never supported these older,
|
||||
* non-ANSI compilers, but the macro is still included because there is some
|
||||
* software out there that uses it.
|
||||
*/
|
||||
|
||||
#define JPP(arglist) arglist
|
||||
|
||||
|
||||
/* Default error-management setup */
|
||||
EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr * err);
|
||||
|
||||
@@ -956,7 +966,7 @@ EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile);
|
||||
/* Data source and destination managers: memory buffers. */
|
||||
EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char ** outbuffer,
|
||||
unsigned long * outsize);
|
||||
EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, unsigned char * inbuffer,
|
||||
EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, const unsigned char * inbuffer,
|
||||
unsigned long insize);
|
||||
#endif
|
||||
|
||||
@@ -1036,10 +1046,10 @@ EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
|
||||
JDIMENSION max_lines);
|
||||
|
||||
/* Additional entry points for buffered-image mode. */
|
||||
EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
|
||||
EXTERN(boolean) jpeg_has_multiple_scans (const j_decompress_ptr cinfo);
|
||||
EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
|
||||
EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
|
||||
EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
|
||||
EXTERN(boolean) jpeg_input_complete (const j_decompress_ptr cinfo);
|
||||
EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
|
||||
EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
|
||||
/* Return value is one of: */
|
||||
@@ -1068,7 +1078,7 @@ EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
|
||||
EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
|
||||
EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
|
||||
jvirt_barray_ptr * coef_arrays);
|
||||
EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
|
||||
EXTERN(void) jpeg_copy_critical_parameters (const j_decompress_ptr srcinfo,
|
||||
j_compress_ptr dstinfo);
|
||||
|
||||
/* If you choose to abort compression or decompression before completing
|
||||
@@ -1090,25 +1100,25 @@ EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
|
||||
EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
|
||||
|
||||
/* Accessor functions for extension parameters */
|
||||
EXTERN(boolean) jpeg_c_bool_param_supported (j_compress_ptr cinfo,
|
||||
EXTERN(boolean) jpeg_c_bool_param_supported (const j_compress_ptr cinfo,
|
||||
J_BOOLEAN_PARAM param);
|
||||
EXTERN(void) jpeg_c_set_bool_param (j_compress_ptr cinfo,
|
||||
J_BOOLEAN_PARAM param, boolean value);
|
||||
EXTERN(boolean) jpeg_c_get_bool_param (j_compress_ptr cinfo,
|
||||
EXTERN(boolean) jpeg_c_get_bool_param (const j_compress_ptr cinfo,
|
||||
J_BOOLEAN_PARAM param);
|
||||
|
||||
EXTERN(boolean) jpeg_c_float_param_supported (j_compress_ptr cinfo,
|
||||
EXTERN(boolean) jpeg_c_float_param_supported (const j_compress_ptr cinfo,
|
||||
J_FLOAT_PARAM param);
|
||||
EXTERN(void) jpeg_c_set_float_param (j_compress_ptr cinfo, J_FLOAT_PARAM param,
|
||||
float value);
|
||||
EXTERN(float) jpeg_c_get_float_param (j_compress_ptr cinfo,
|
||||
EXTERN(float) jpeg_c_get_float_param (const j_compress_ptr cinfo,
|
||||
J_FLOAT_PARAM param);
|
||||
|
||||
EXTERN(boolean) jpeg_c_int_param_supported (j_compress_ptr cinfo,
|
||||
EXTERN(boolean) jpeg_c_int_param_supported (const j_compress_ptr cinfo,
|
||||
J_INT_PARAM param);
|
||||
EXTERN(void) jpeg_c_set_int_param (j_compress_ptr cinfo, J_INT_PARAM param,
|
||||
int value);
|
||||
EXTERN(int) jpeg_c_get_int_param (j_compress_ptr cinfo, J_INT_PARAM param);
|
||||
EXTERN(int) jpeg_c_get_int_param (const j_compress_ptr cinfo, J_INT_PARAM param);
|
||||
|
||||
|
||||
/* These marker codes are exported since applications and data source modules
|
||||
|
||||
@@ -503,7 +503,7 @@ main (int argc, char **argv)
|
||||
jpeg_mem_src(&srcinfo, inbuffer, insize);
|
||||
} else
|
||||
#endif
|
||||
jpeg_stdio_src(&srcinfo, fp);
|
||||
jpeg_stdio_src(&srcinfo, fp);
|
||||
|
||||
/* Enable saving of extra markers that we want to copy */
|
||||
jcopy_markers_setup(&srcinfo, copyoption);
|
||||
@@ -572,7 +572,7 @@ main (int argc, char **argv)
|
||||
jpeg_mem_dest(&dstinfo, &outbuffer, &outsize);
|
||||
else
|
||||
#endif
|
||||
jpeg_stdio_dest(&dstinfo, fp);
|
||||
jpeg_stdio_dest(&dstinfo, fp);
|
||||
|
||||
/* Start compressor (note no image data is actually written here) */
|
||||
jpeg_write_coefficients(&dstinfo, dst_coef_arrays);
|
||||
@@ -624,6 +624,9 @@ main (int argc, char **argv)
|
||||
end_progress_monitor((j_common_ptr) &dstinfo);
|
||||
#endif
|
||||
|
||||
free(inbuffer);
|
||||
free(outbuffer);
|
||||
|
||||
/* All done. */
|
||||
exit(jsrcerr.num_warnings + jdsterr.num_warnings ?EXIT_WARNING:EXIT_SUCCESS);
|
||||
return 0; /* suppress no-return-value warnings */
|
||||
|
||||
@@ -112,6 +112,7 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
image_buffer = malloc(frame_width*16 + 2*(frame_width/2)*8);
|
||||
if (!image_buffer) {
|
||||
free(yuv_buffer);
|
||||
fprintf(stderr, "Memory allocation failure!\n");
|
||||
return 1;
|
||||
}
|
||||
@@ -163,6 +164,7 @@ int main(int argc, char *argv[]) {
|
||||
yuv_fd = fopen(yuv_path, "wb");
|
||||
if (!yuv_fd) {
|
||||
fprintf(stderr, "Invalid path to YUV file!");
|
||||
free(yuv_buffer);
|
||||
return 1;
|
||||
}
|
||||
if (fwrite(yuv_buffer, yuv_size, 1, yuv_fd) != 1) {
|
||||
|
||||
2
rdbmp.c
2
rdbmp.c
@@ -381,7 +381,7 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
return;
|
||||
}
|
||||
|
||||
if (biWidth <= 0 || biHeight <= 0)
|
||||
if (biWidth <= 0 || biHeight <= 0 || biWidth > 0x7fffffffL || biHeight > 0x7fffffffL)
|
||||
ERREXIT(cinfo, JERR_BMP_EMPTY);
|
||||
if (biPlanes != 1)
|
||||
ERREXIT(cinfo, JERR_BMP_BADPLANES);
|
||||
|
||||
4
rdpng.c
4
rdpng.c
@@ -76,6 +76,10 @@ start_input_png (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
cinfo->in_color_space = JCS_RGB;
|
||||
cinfo->input_components = 3;
|
||||
}
|
||||
|
||||
if (bit_depth == 16)
|
||||
png_set_strip_16(source->png_ptr);
|
||||
|
||||
cinfo->data_precision = 8;
|
||||
cinfo->image_width = width;
|
||||
cinfo->image_height = height;
|
||||
|
||||
24
rdppm.c
24
rdppm.c
@@ -68,6 +68,7 @@ typedef struct {
|
||||
JSAMPROW pixrow; /* compressor input buffer */
|
||||
size_t buffer_width; /* width of I/O buffer */
|
||||
JSAMPLE *rescale; /* => maxval-remapping array, or NULL */
|
||||
int maxval;
|
||||
} ppm_source_struct;
|
||||
|
||||
typedef ppm_source_struct * ppm_source_ptr;
|
||||
@@ -91,7 +92,7 @@ pbm_getc (FILE * infile)
|
||||
|
||||
|
||||
LOCAL(unsigned int)
|
||||
read_pbm_integer (j_compress_ptr cinfo, FILE * infile)
|
||||
read_pbm_integer (j_compress_ptr cinfo, FILE * infile, int maxval)
|
||||
/* Read an unsigned decimal integer from the PPM file */
|
||||
/* Swallows one trailing character after the integer */
|
||||
/* Note that on a 16-bit-int machine, only values up to 64k can be read. */
|
||||
@@ -115,6 +116,10 @@ read_pbm_integer (j_compress_ptr cinfo, FILE * infile)
|
||||
val *= 10;
|
||||
val += ch - '0';
|
||||
}
|
||||
|
||||
if (val > maxval)
|
||||
ERREXIT(cinfo, JERR_PPM_TOOLARGE);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@@ -139,10 +144,11 @@ get_text_gray_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
register JSAMPROW ptr;
|
||||
register JSAMPLE *rescale = source->rescale;
|
||||
JDIMENSION col;
|
||||
int maxval = source->maxval;
|
||||
|
||||
ptr = source->pub.buffer[0];
|
||||
for (col = cinfo->image_width; col > 0; col--) {
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile)];
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
@@ -157,12 +163,13 @@ get_text_rgb_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
register JSAMPROW ptr;
|
||||
register JSAMPLE *rescale = source->rescale;
|
||||
JDIMENSION col;
|
||||
int maxval = source->maxval;
|
||||
|
||||
ptr = source->pub.buffer[0];
|
||||
for (col = cinfo->image_width; col > 0; col--) {
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile)];
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile)];
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile)];
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
|
||||
*ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
@@ -311,9 +318,9 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
}
|
||||
|
||||
/* fetch the remaining header info */
|
||||
w = read_pbm_integer(cinfo, source->pub.input_file);
|
||||
h = read_pbm_integer(cinfo, source->pub.input_file);
|
||||
maxval = read_pbm_integer(cinfo, source->pub.input_file);
|
||||
w = read_pbm_integer(cinfo, source->pub.input_file, 65535);
|
||||
h = read_pbm_integer(cinfo, source->pub.input_file, 65535);
|
||||
maxval = read_pbm_integer(cinfo, source->pub.input_file, 65535);
|
||||
|
||||
if (w <= 0 || h <= 0 || maxval <= 0) /* error check */
|
||||
ERREXIT(cinfo, JERR_PPM_NOT);
|
||||
@@ -321,6 +328,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
|
||||
cinfo->image_width = (JDIMENSION) w;
|
||||
cinfo->image_height = (JDIMENSION) h;
|
||||
source->maxval = maxval;
|
||||
|
||||
/* initialize flags to most common settings */
|
||||
need_iobuffer = TRUE; /* do we need an I/O buffer? */
|
||||
|
||||
@@ -364,7 +364,8 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
|
||||
if (cmaptype > 1 || /* cmaptype must be 0 or 1 */
|
||||
source->pixel_size < 1 || source->pixel_size > 4 ||
|
||||
(UCH(targaheader[16]) & 7) != 0 || /* bits/pixel must be multiple of 8 */
|
||||
interlace_type != 0) /* currently don't allow interlaced image */
|
||||
interlace_type != 0 || /* currently don't allow interlaced image */
|
||||
width == 0 || height == 0) /* image width/height must be nonzero */
|
||||
ERREXIT(cinfo, JERR_TGA_BADPARMS);
|
||||
|
||||
if (subtype > 8) {
|
||||
|
||||
@@ -17,7 +17,7 @@ onexit()
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARM v6 build dir]] [-buildarmv7 [ARM v7 build dir]] [-buildarmv7s [ARM v7s build dir]]"
|
||||
echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARMv6 build dir]] [-buildarmv7 [ARMv7 build dir]] [-buildarmv7s [ARMv7s build dir] [-buildarmv8 [ARMv8 build dir]] [-lipo [path to lipo]]"
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -33,7 +33,10 @@ BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
|
||||
BUILDARMV7=0
|
||||
BUILDDIRARMV7S=@abs_top_srcdir@/iosarmv7s
|
||||
BUILDARMV7S=0
|
||||
BUILDDIRARMV8=@abs_top_srcdir@/iosarmv8
|
||||
BUILDARMV8=0
|
||||
WITH_JAVA=@WITH_JAVA@
|
||||
LIPO=lipo
|
||||
|
||||
PREFIX=%{__prefix}
|
||||
BINDIR=%{__bindir}
|
||||
@@ -75,6 +78,21 @@ while [ $# -gt 0 ]; do
|
||||
fi
|
||||
fi
|
||||
;;
|
||||
-buildarmv8)
|
||||
BUILDARMV8=1
|
||||
if [ $# -gt 1 ]; then
|
||||
if [[ ! "$2" =~ -.* ]]; then
|
||||
BUILDDIRARMV8=$2; shift
|
||||
fi
|
||||
fi
|
||||
;;
|
||||
-lipo)
|
||||
if [ $# -gt 1 ]; then
|
||||
if [[ ! "$2" =~ -.* ]]; then
|
||||
LIPO=$2; shift
|
||||
fi
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
@@ -110,50 +128,50 @@ if [ $BUILD32 = 1 ]; then
|
||||
popd
|
||||
if [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||
elif [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||
fi
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
|
||||
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
|
||||
-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/cjpeg \
|
||||
-arch x86_64 $PKGROOT/$BINDIR/cjpeg \
|
||||
-output $PKGROOT/$BINDIR/cjpeg
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/djpeg \
|
||||
-arch x86_64 $PKGROOT/$BINDIR/djpeg \
|
||||
-output $PKGROOT/$BINDIR/djpeg
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/jpegtran \
|
||||
-arch x86_64 $PKGROOT/$BINDIR/jpegtran \
|
||||
-output $PKGROOT/$BINDIR/jpegtran
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/tjbench \
|
||||
-arch x86_64 $PKGROOT/$BINDIR/tjbench \
|
||||
-output $PKGROOT/$BINDIR/tjbench
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/rdjpgcom \
|
||||
-arch x86_64 $PKGROOT/$BINDIR/rdjpgcom \
|
||||
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
|
||||
-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
|
||||
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||
@@ -162,71 +180,258 @@ fi
|
||||
|
||||
if [ $BUILDARMV6 = 1 ]; then
|
||||
if [ ! -d $BUILDDIRARMV6 ]; then
|
||||
echo ERROR: ARM v6 build directory $BUILDDIRARMV6 does not exist
|
||||
echo ERROR: ARMv6 build directory $BUILDDIRARMV6 does not exist
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -f $BUILDDIRARMV6/Makefile ]; then
|
||||
echo ERROR: ARM v6 build directory $BUILDDIRARMV6 is not configured
|
||||
echo ERROR: ARMv6 build directory $BUILDDIRARMV6 is not configured
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p $TMPDIR/dist.armv6
|
||||
pushd $BUILDDIRARMV6
|
||||
make install DESTDIR=$TMPDIR/dist.armv6
|
||||
popd
|
||||
lipo -create \
|
||||
if [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||
elif [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||
fi
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/cjpeg \
|
||||
-arch arm $TMPDIR/dist.armv6/$BINDIR/cjpeg \
|
||||
-output $PKGROOT/$BINDIR/cjpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/djpeg \
|
||||
-arch arm $TMPDIR/dist.armv6/$BINDIR/djpeg \
|
||||
-output $PKGROOT/$BINDIR/djpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/jpegtran \
|
||||
-arch arm $TMPDIR/dist.armv6/$BINDIR/jpegtran \
|
||||
-output $PKGROOT/$BINDIR/jpegtran
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/tjbench \
|
||||
-arch arm $TMPDIR/dist.armv6/$BINDIR/tjbench \
|
||||
-output $PKGROOT/$BINDIR/tjbench
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/rdjpgcom \
|
||||
-arch arm $TMPDIR/dist.armv6/$BINDIR/rdjpgcom \
|
||||
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/wrjpgcom \
|
||||
-arch arm $TMPDIR/dist.armv6/$BINDIR/wrjpgcom \
|
||||
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||
fi
|
||||
|
||||
if [ $BUILDARMV7 = 1 ]; then
|
||||
if [ ! -d $BUILDDIRARMV7 ]; then
|
||||
echo ERROR: ARM v7 build directory $BUILDDIRARMV7 does not exist
|
||||
echo ERROR: ARMv7 build directory $BUILDDIRARMV7 does not exist
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -f $BUILDDIRARMV7/Makefile ]; then
|
||||
echo ERROR: ARM v7 build directory $BUILDDIRARMV7 is not configured
|
||||
echo ERROR: ARMv7 build directory $BUILDDIRARMV7 is not configured
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p $TMPDIR/dist.armv7
|
||||
pushd $BUILDDIRARMV7
|
||||
make install DESTDIR=$TMPDIR/dist.armv7
|
||||
popd
|
||||
lipo -create \
|
||||
if [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||
elif [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||
fi
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/cjpeg \
|
||||
-arch arm $TMPDIR/dist.armv7/$BINDIR/cjpeg \
|
||||
-output $PKGROOT/$BINDIR/cjpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/djpeg \
|
||||
-arch arm $TMPDIR/dist.armv7/$BINDIR/djpeg \
|
||||
-output $PKGROOT/$BINDIR/djpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/jpegtran \
|
||||
-arch arm $TMPDIR/dist.armv7/$BINDIR/jpegtran \
|
||||
-output $PKGROOT/$BINDIR/jpegtran
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/tjbench \
|
||||
-arch arm $TMPDIR/dist.armv7/$BINDIR/tjbench \
|
||||
-output $PKGROOT/$BINDIR/tjbench
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/rdjpgcom \
|
||||
-arch arm $TMPDIR/dist.armv7/$BINDIR/rdjpgcom \
|
||||
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/wrjpgcom \
|
||||
-arch arm $TMPDIR/dist.armv7/$BINDIR/wrjpgcom \
|
||||
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||
fi
|
||||
|
||||
if [ $BUILDARMV7S = 1 ]; then
|
||||
if [ ! -d $BUILDDIRARMV7S ]; then
|
||||
echo ERROR: ARM v7s build directory $BUILDDIRARMV7S does not exist
|
||||
echo ERROR: ARMv7s build directory $BUILDDIRARMV7S does not exist
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -f $BUILDDIRARMV7S/Makefile ]; then
|
||||
echo ERROR: ARM v7s build directory $BUILDDIRARMV7S is not configured
|
||||
echo ERROR: ARMv7s build directory $BUILDDIRARMV7S is not configured
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p $TMPDIR/dist.armv7s
|
||||
pushd $BUILDDIRARMV7S
|
||||
make install DESTDIR=$TMPDIR/dist.armv7s
|
||||
popd
|
||||
lipo -create \
|
||||
if [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||
elif [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||
fi
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||
lipo -create \
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/cjpeg \
|
||||
-arch arm $TMPDIR/dist.armv7s/$BINDIR/cjpeg \
|
||||
-output $PKGROOT/$BINDIR/cjpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/djpeg \
|
||||
-arch arm $TMPDIR/dist.armv7s/$BINDIR/djpeg \
|
||||
-output $PKGROOT/$BINDIR/djpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/jpegtran \
|
||||
-arch arm $TMPDIR/dist.armv7s/$BINDIR/jpegtran \
|
||||
-output $PKGROOT/$BINDIR/jpegtran
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/tjbench \
|
||||
-arch arm $TMPDIR/dist.armv7s/$BINDIR/tjbench \
|
||||
-output $PKGROOT/$BINDIR/tjbench
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/rdjpgcom \
|
||||
-arch arm $TMPDIR/dist.armv7s/$BINDIR/rdjpgcom \
|
||||
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/wrjpgcom \
|
||||
-arch arm $TMPDIR/dist.armv7s/$BINDIR/wrjpgcom \
|
||||
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||
fi
|
||||
|
||||
if [ $BUILDARMV8 = 1 ]; then
|
||||
if [ ! -d $BUILDDIRARMV8 ]; then
|
||||
echo ERROR: ARMv8 build directory $BUILDDIRARMV8 does not exist
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -f $BUILDDIRARMV8/Makefile ]; then
|
||||
echo ERROR: ARMv8 build directory $BUILDDIRARMV8 is not configured
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p $TMPDIR/dist.armv8
|
||||
pushd $BUILDDIRARMV8
|
||||
make install DESTDIR=$TMPDIR/dist.armv8
|
||||
popd
|
||||
if [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||
elif [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||
fi
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.0.dylib \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||
$LIPO -create \
|
||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.a \
|
||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/cjpeg \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/cjpeg \
|
||||
-output $PKGROOT/$BINDIR/cjpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/djpeg \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/djpeg \
|
||||
-output $PKGROOT/$BINDIR/djpeg
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/jpegtran \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/jpegtran \
|
||||
-output $PKGROOT/$BINDIR/jpegtran
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/tjbench \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/tjbench \
|
||||
-output $PKGROOT/$BINDIR/tjbench
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/rdjpgcom \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/rdjpgcom \
|
||||
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||
$LIPO -create \
|
||||
$PKGROOT/$BINDIR/wrjpgcom \
|
||||
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/wrjpgcom \
|
||||
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||
fi
|
||||
|
||||
install_name_tool -id $LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||
@@ -259,7 +464,7 @@ productbuild --distribution $SRCDIR/release/Distribution.xml \
|
||||
--package-path $TMPDIR/pkg/ --resources $TMPDIR/pkg/ \
|
||||
$TMPDIR/dmg/$PACKAGE_NAME.pkg
|
||||
hdiutil create -fs HFS+ -volname $PACKAGE_NAME-$VERSION \
|
||||
-srcfolder "$TMPDIR/dmg" $TMPDIR/$PACKAGE_NAME-$VERSION.dmg
|
||||
cp $TMPDIR/$PACKAGE_NAME-$VERSION.dmg .
|
||||
-srcfolder "$TMPDIR/dmg" $TMPDIR/$PACKAGE_NAME-$VERSION.dmg
|
||||
cp $TMPDIR/$PACKAGE_NAME-$VERSION.dmg .
|
||||
|
||||
exit
|
||||
|
||||
@@ -111,6 +111,7 @@ Section "Uninstall"
|
||||
|
||||
!ifdef GCC
|
||||
Delete $INSTDIR\bin\libjpeg-@DLL_VERSION@.dll
|
||||
Delete $INSTDIR\bin\libturbojpeg.dll
|
||||
Delete $SYSDIR\libturbojpeg.dll
|
||||
Delete $INSTDIR\lib\libturbojpeg.dll.a"
|
||||
Delete $INSTDIR\lib\libturbojpeg.a"
|
||||
@@ -118,6 +119,7 @@ Section "Uninstall"
|
||||
Delete $INSTDIR\lib\libjpeg.a"
|
||||
!else
|
||||
Delete $INSTDIR\bin\jpeg@DLL_VERSION@.dll
|
||||
Delete $INSTDIR\bin\turbojpeg.dll
|
||||
Delete $SYSDIR\turbojpeg.dll
|
||||
Delete $INSTDIR\lib\jpeg.lib
|
||||
Delete $INSTDIR\lib\jpeg-static.lib
|
||||
|
||||
@@ -72,8 +72,15 @@ endif
|
||||
|
||||
if SIMD_POWERPC
|
||||
|
||||
libsimd_la_SOURCES = jsimd_powerpc.c jsimd_powerpc_altivec.c
|
||||
libsimd_la_CFLAGS = -maltivec
|
||||
libsimd_la_SOURCES = jsimd_powerpc.c \
|
||||
jccolor-altivec.c jcgray-altivec.c \
|
||||
jfdctfst-altivec.c jfdctint-altivec.c \
|
||||
jidctfst-altivec.c jidctint-altivec.c \
|
||||
jquanti-altivec.c
|
||||
libsimd_la_CFLAGS = -maltivec
|
||||
|
||||
jccolor-altivec.lo: jccolext-altivec.c
|
||||
jcgray-altivec.lo: jcgryext-altivec.c
|
||||
|
||||
endif
|
||||
|
||||
|
||||
250
simd/jccolext-altivec.c
Normal file
250
simd/jccolext-altivec.c
Normal file
@@ -0,0 +1,250 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* Copyright (C) 2014, Jay Foad.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jccolor-altivec.c */
|
||||
|
||||
|
||||
void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
JSAMPROW inptr;
|
||||
JSAMPROW outptr0, outptr1, outptr2;
|
||||
int pitch;
|
||||
__vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
|
||||
rgbg1, rgbg2, rgbg3, y, cb, cr;
|
||||
#if RGB_PIXELSIZE == 4
|
||||
__vector unsigned char rgb4;
|
||||
#endif
|
||||
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
|
||||
__vector unsigned short y01, y23, cr01, cr23, cb01, cb23;
|
||||
__vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
|
||||
|
||||
/* Constants */
|
||||
__vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
|
||||
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
|
||||
pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
|
||||
pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
|
||||
__vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
|
||||
__vector int pd_onehalf = { __4X(ONE_HALF) },
|
||||
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
|
||||
__vector unsigned char zero = { __16X(0) },
|
||||
shift_pack_index =
|
||||
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr = *input_buf++;
|
||||
outptr0 = output_buf[0][output_row];
|
||||
outptr1 = output_buf[1][output_row];
|
||||
outptr2 = output_buf[2][output_row];
|
||||
output_row++;
|
||||
|
||||
for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
|
||||
pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
|
||||
outptr0 += 16, outptr1 += 16, outptr2 += 16) {
|
||||
|
||||
#if RGB_PIXELSIZE == 3
|
||||
/* Load 16 pixels == 48 bytes */
|
||||
if ((size_t)inptr & 15) {
|
||||
__vector unsigned char unaligned_shift_index;
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
else
|
||||
rgb1 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
else
|
||||
rgb2 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 48)
|
||||
rgb3 = vec_ld(48, inptr);
|
||||
else
|
||||
rgb3 = vec_ld(-1, inptr + pitch);
|
||||
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||
} else {
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
}
|
||||
|
||||
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
|
||||
* rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
|
||||
* rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
|
||||
*
|
||||
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||
*/
|
||||
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
|
||||
rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
|
||||
rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
|
||||
rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
|
||||
#else
|
||||
/* Load 16 pixels == 64 bytes */
|
||||
if ((size_t)inptr & 15) {
|
||||
__vector unsigned char unaligned_shift_index;
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
else
|
||||
rgb1 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
else
|
||||
rgb2 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 48)
|
||||
rgb3 = vec_ld(48, inptr);
|
||||
else
|
||||
rgb3 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 64)
|
||||
rgb4 = vec_ld(64, inptr);
|
||||
else
|
||||
rgb4 = vec_ld(-1, inptr + pitch);
|
||||
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||
rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
|
||||
} else {
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
if (pitch > 48)
|
||||
rgb3 = vec_ld(48, inptr);
|
||||
}
|
||||
|
||||
/* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
|
||||
* rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
|
||||
* rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
|
||||
* rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
|
||||
*
|
||||
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||
*/
|
||||
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
|
||||
rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
|
||||
rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
|
||||
rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
|
||||
#endif
|
||||
|
||||
/* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
|
||||
* bg0 = B0 G0 B1 G1 B2 G2 B3 G3
|
||||
* ...
|
||||
*
|
||||
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
|
||||
* support unsigned vectors.
|
||||
*/
|
||||
rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
|
||||
bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
|
||||
rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
|
||||
bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
|
||||
rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
|
||||
bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
|
||||
rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
|
||||
bg3 = (__vector signed short)vec_mergel(zero, rgbg3);
|
||||
|
||||
/* (Original)
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
*
|
||||
* (This implementation)
|
||||
* Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
*/
|
||||
|
||||
/* Calculate Y values */
|
||||
|
||||
y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
|
||||
y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
|
||||
y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
|
||||
y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
|
||||
y0 = vec_msums(bg0, pw_f0114_f0250, y0);
|
||||
y1 = vec_msums(bg1, pw_f0114_f0250, y1);
|
||||
y2 = vec_msums(bg2, pw_f0114_f0250, y2);
|
||||
y3 = vec_msums(bg3, pw_f0114_f0250, y3);
|
||||
/* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
|
||||
* each dword into a new 16-bit vector, which is the equivalent of
|
||||
* descaling the 32-bit results (right-shifting by 16 bits) and then
|
||||
* packing them.
|
||||
*/
|
||||
y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
|
||||
shift_pack_index);
|
||||
y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
|
||||
shift_pack_index);
|
||||
y = vec_pack(y01, y23);
|
||||
vec_st(y, 0, outptr0);
|
||||
|
||||
/* Calculate Cb values */
|
||||
cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||
cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||
cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||
cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||
cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
|
||||
(__vector unsigned int)cb0);
|
||||
cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
|
||||
(__vector unsigned int)cb1);
|
||||
cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
|
||||
(__vector unsigned int)cb2);
|
||||
cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
|
||||
(__vector unsigned int)cb3);
|
||||
cb01 = vec_perm((__vector unsigned short)cb0,
|
||||
(__vector unsigned short)cb1, shift_pack_index);
|
||||
cb23 = vec_perm((__vector unsigned short)cb2,
|
||||
(__vector unsigned short)cb3, shift_pack_index);
|
||||
cb = vec_pack(cb01, cb23);
|
||||
vec_st(cb, 0, outptr1);
|
||||
|
||||
/* Calculate Cr values */
|
||||
cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||
cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||
cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||
cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||
cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
|
||||
(__vector unsigned int)cr0);
|
||||
cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
|
||||
(__vector unsigned int)cr1);
|
||||
cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
|
||||
(__vector unsigned int)cr2);
|
||||
cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
|
||||
(__vector unsigned int)cr3);
|
||||
cr01 = vec_perm((__vector unsigned short)cr0,
|
||||
(__vector unsigned short)cr1, shift_pack_index);
|
||||
cr23 = vec_perm((__vector unsigned short)cr2,
|
||||
(__vector unsigned short)cr3, shift_pack_index);
|
||||
cr = vec_pack(cr01, cr23);
|
||||
vec_st(cr, 0, outptr2);
|
||||
}
|
||||
}
|
||||
}
|
||||
104
simd/jccolor-altivec.c
Normal file
104
simd/jccolor-altivec.c
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* RGB --> YCC CONVERSION */
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
#define F_0_081 5329 /* FIX(0.08131) */
|
||||
#define F_0_114 7471 /* FIX(0.11400) */
|
||||
#define F_0_168 11059 /* FIX(0.16874) */
|
||||
#define F_0_250 16384 /* FIX(0.25000) */
|
||||
#define F_0_299 19595 /* FIX(0.29900) */
|
||||
#define F_0_331 21709 /* FIX(0.33126) */
|
||||
#define F_0_418 27439 /* FIX(0.41869) */
|
||||
#define F_0_500 32768 /* FIX(0.50000) */
|
||||
#define F_0_587 38470 /* FIX(0.58700) */
|
||||
#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
|
||||
|
||||
#define SCALEBITS 16
|
||||
#define ONE_HALF (1 << (SCALEBITS - 1))
|
||||
|
||||
|
||||
#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
|
||||
#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
|
||||
#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
|
||||
#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX0
|
||||
#undef RGBG_INDEX1
|
||||
#undef RGBG_INDEX2
|
||||
#undef RGBG_INDEX3
|
||||
#undef jsimd_rgb_ycc_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
|
||||
#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_ycc_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
|
||||
#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
|
||||
#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
|
||||
#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
|
||||
#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX0
|
||||
#undef RGBG_INDEX1
|
||||
#undef RGBG_INDEX2
|
||||
#undef RGBG_INDEX3
|
||||
#undef jsimd_rgb_ycc_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
|
||||
#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_ycc_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
|
||||
#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_ycc_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
|
||||
#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
|
||||
#include "jccolext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_ycc_convert_altivec
|
||||
99
simd/jcgray-altivec.c
Normal file
99
simd/jcgray-altivec.c
Normal file
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* RGB --> GRAYSCALE CONVERSION */
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
#define F_0_114 7471 /* FIX(0.11400) */
|
||||
#define F_0_250 16384 /* FIX(0.25000) */
|
||||
#define F_0_299 19595 /* FIX(0.29900) */
|
||||
#define F_0_587 38470 /* FIX(0.58700) */
|
||||
#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
|
||||
|
||||
#define SCALEBITS 16
|
||||
#define ONE_HALF (1 << (SCALEBITS - 1))
|
||||
|
||||
|
||||
#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
|
||||
#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
|
||||
#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
|
||||
#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX0
|
||||
#undef RGBG_INDEX1
|
||||
#undef RGBG_INDEX2
|
||||
#undef RGBG_INDEX3
|
||||
#undef jsimd_rgb_gray_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
|
||||
#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_gray_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
|
||||
#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
|
||||
#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
|
||||
#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
|
||||
#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX0
|
||||
#undef RGBG_INDEX1
|
||||
#undef RGBG_INDEX2
|
||||
#undef RGBG_INDEX3
|
||||
#undef jsimd_rgb_gray_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
|
||||
#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_gray_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
|
||||
#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_gray_convert_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
|
||||
#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
|
||||
#include "jcgryext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGBG_INDEX
|
||||
#undef jsimd_rgb_gray_convert_altivec
|
||||
200
simd/jcgryext-altivec.c
Normal file
200
simd/jcgryext-altivec.c
Normal file
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* Copyright (C) 2014, Jay Foad.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jcgray-altivec.c */
|
||||
|
||||
|
||||
void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
JSAMPROW inptr, outptr;
|
||||
int pitch;
|
||||
__vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
|
||||
rgbg1, rgbg2, rgbg3, y;
|
||||
#if RGB_PIXELSIZE == 4
|
||||
__vector unsigned char rgb4;
|
||||
#endif
|
||||
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
|
||||
__vector unsigned short y01, y23;
|
||||
__vector int y0, y1, y2, y3;
|
||||
|
||||
/* Constants */
|
||||
__vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
|
||||
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
|
||||
__vector int pd_onehalf = { __4X(ONE_HALF) };
|
||||
__vector unsigned char zero = { __16X(0) },
|
||||
shift_pack_index =
|
||||
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr = *input_buf++;
|
||||
outptr = output_buf[0][output_row];
|
||||
output_row++;
|
||||
|
||||
for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
|
||||
pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
|
||||
outptr += 16) {
|
||||
|
||||
#if RGB_PIXELSIZE == 3
|
||||
/* Load 16 pixels == 48 bytes */
|
||||
if ((size_t)inptr & 15) {
|
||||
__vector unsigned char unaligned_shift_index;
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
else
|
||||
rgb1 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
else
|
||||
rgb2 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 48)
|
||||
rgb3 = vec_ld(48, inptr);
|
||||
else
|
||||
rgb3 = vec_ld(-1, inptr + pitch);
|
||||
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||
} else {
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
}
|
||||
|
||||
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
|
||||
* rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
|
||||
* rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
|
||||
*
|
||||
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||
*/
|
||||
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
|
||||
rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
|
||||
rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
|
||||
rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
|
||||
#else
|
||||
/* Load 16 pixels == 64 bytes */
|
||||
if ((size_t)inptr & 15) {
|
||||
__vector unsigned char unaligned_shift_index;
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
else
|
||||
rgb1 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
else
|
||||
rgb2 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 48)
|
||||
rgb3 = vec_ld(48, inptr);
|
||||
else
|
||||
rgb3 = vec_ld(-1, inptr + pitch);
|
||||
if (pitch > 64)
|
||||
rgb4 = vec_ld(64, inptr);
|
||||
else
|
||||
rgb4 = vec_ld(-1, inptr + pitch);
|
||||
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||
rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
|
||||
} else {
|
||||
rgb0 = vec_ld(0, inptr);
|
||||
if (pitch > 16)
|
||||
rgb1 = vec_ld(16, inptr);
|
||||
if (pitch > 32)
|
||||
rgb2 = vec_ld(32, inptr);
|
||||
if (pitch > 48)
|
||||
rgb3 = vec_ld(48, inptr);
|
||||
}
|
||||
|
||||
/* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
|
||||
* rgb0 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
|
||||
* rgb0 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
|
||||
* rgb0 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
|
||||
*
|
||||
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||
*/
|
||||
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
|
||||
rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
|
||||
rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
|
||||
rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
|
||||
#endif
|
||||
|
||||
/* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
|
||||
* bg0 = B0 G0 B1 G1 B2 G2 B3 G3
|
||||
* ...
|
||||
*
|
||||
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
|
||||
* support unsigned vectors.
|
||||
*/
|
||||
rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
|
||||
bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
|
||||
rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
|
||||
bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
|
||||
rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
|
||||
bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
|
||||
rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
|
||||
bg3 = (__vector signed short)vec_mergel(zero, rgbg3);
|
||||
|
||||
/* (Original)
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
*
|
||||
* (This implementation)
|
||||
* Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
*/
|
||||
|
||||
/* Calculate Y values */
|
||||
|
||||
y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
|
||||
y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
|
||||
y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
|
||||
y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
|
||||
y0 = vec_msums(bg0, pw_f0114_f0250, y0);
|
||||
y1 = vec_msums(bg1, pw_f0114_f0250, y1);
|
||||
y2 = vec_msums(bg2, pw_f0114_f0250, y2);
|
||||
y3 = vec_msums(bg3, pw_f0114_f0250, y3);
|
||||
/* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
|
||||
* each dword into a new 16-bit vector, which is the equivalent of
|
||||
* descaling the 32-bit results (right-shifting by 16 bits) and then
|
||||
* packing them.
|
||||
*/
|
||||
y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
|
||||
shift_pack_index);
|
||||
y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
|
||||
shift_pack_index);
|
||||
y = vec_pack(y01, y23);
|
||||
vec_st(y, 0, outptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
156
simd/jfdctfst-altivec.c
Normal file
156
simd/jfdctfst-altivec.c
Normal file
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* FAST INTEGER FORWARD DCT
|
||||
*
|
||||
* This is similar to the SSE2 implementation, except that we left-shift the
|
||||
* constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
|
||||
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
|
||||
* the elements in arg3 + the most significant 17 bits of
|
||||
* (the elements in arg1 * the elements in arg2).
|
||||
*/
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
#define F_0_382 98 /* FIX(0.382683433) */
|
||||
#define F_0_541 139 /* FIX(0.541196100) */
|
||||
#define F_0_707 181 /* FIX(0.707106781) */
|
||||
#define F_1_306 334 /* FIX(1.306562965) */
|
||||
|
||||
#define CONST_BITS 8
|
||||
#define PRE_MULTIPLY_SCALE_BITS 2
|
||||
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
|
||||
|
||||
|
||||
/*
 * One 1-D pass of the fast integer forward DCT over eight vectors.
 *
 * Expects tmp0..tmp7 (the sums/differences of mirrored inputs) and writes
 * out0..out7.  It also uses tmp10..tmp13, z1..z5, z11, z13 and the constants
 * zero, pw_0382, pw_0541, pw_0707, pw_1306 and pre_multiply_scale_bits, all
 * of which must be declared in the invoking scope.
 *
 * Scaling: each multiplicand is shifted left by PRE_MULTIPLY_SCALE_BITS and
 * each constant is pre-shifted by CONST_SHIFT, so that the "most significant
 * 17 bits" product returned by vec_madds() comes out at the original scale.
 *
 * The body is wrapped in do { } while (0) so that "DO_FDCT();" is a single
 * statement and remains safe inside an unbraced if/else.
 */
#define DO_FDCT()  \
  do {  \
    /* Even part */  \
    \
    tmp10 = vec_add(tmp0, tmp3);  \
    tmp13 = vec_sub(tmp0, tmp3);  \
    tmp11 = vec_add(tmp1, tmp2);  \
    tmp12 = vec_sub(tmp1, tmp2);  \
    \
    out0 = vec_add(tmp10, tmp11);  \
    out4 = vec_sub(tmp10, tmp11);  \
    \
    /* z1 = (tmp12 + tmp13) * 0.707106781 */  \
    z1 = vec_add(tmp12, tmp13);  \
    z1 = vec_sl(z1, pre_multiply_scale_bits);  \
    z1 = vec_madds(z1, pw_0707, zero);  \
    \
    out2 = vec_add(tmp13, z1);  \
    out6 = vec_sub(tmp13, z1);  \
    \
    /* Odd part */  \
    \
    tmp10 = vec_add(tmp4, tmp5);  \
    tmp11 = vec_add(tmp5, tmp6);  \
    tmp12 = vec_add(tmp6, tmp7);  \
    \
    tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
    tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
    /* z5 = (tmp10 - tmp12) * 0.382683433 */  \
    z5 = vec_sub(tmp10, tmp12);  \
    z5 = vec_madds(z5, pw_0382, zero);  \
    \
    /* z2 = tmp10 * 0.541196100 + z5;  z4 = tmp12 * 1.306562965 + z5 */  \
    z2 = vec_madds(tmp10, pw_0541, z5);  \
    z4 = vec_madds(tmp12, pw_1306, z5);  \
    \
    /* z3 = tmp11 * 0.707106781 */  \
    tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
    z3 = vec_madds(tmp11, pw_0707, zero);  \
    \
    z11 = vec_add(tmp7, z3);  \
    z13 = vec_sub(tmp7, z3);  \
    \
    out5 = vec_add(z13, z2);  \
    out3 = vec_sub(z13, z2);  \
    out1 = vec_add(z11, z4);  \
    out7 = vec_sub(z11, z4);  \
  } while (0)
|
||||
|
||||
|
||||
void
|
||||
jsimd_fdct_ifast_altivec (DCTELEM *data)
|
||||
{
|
||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||
z1, z2, z3, z4, z5, z11, z13,
|
||||
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||
|
||||
/* Constants */
|
||||
__vector short zero = vec_splat_s16(0),
|
||||
pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
|
||||
pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
|
||||
pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
|
||||
pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
|
||||
__vector unsigned short
|
||||
pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
|
||||
|
||||
/* Pass 1: process rows */
|
||||
|
||||
row0 = vec_ld(0, data);
|
||||
row1 = vec_ld(16, data);
|
||||
row2 = vec_ld(32, data);
|
||||
row3 = vec_ld(48, data);
|
||||
row4 = vec_ld(64, data);
|
||||
row5 = vec_ld(80, data);
|
||||
row6 = vec_ld(96, data);
|
||||
row7 = vec_ld(112, data);
|
||||
|
||||
TRANSPOSE(row, col);
|
||||
|
||||
tmp0 = vec_add(col0, col7);
|
||||
tmp7 = vec_sub(col0, col7);
|
||||
tmp1 = vec_add(col1, col6);
|
||||
tmp6 = vec_sub(col1, col6);
|
||||
tmp2 = vec_add(col2, col5);
|
||||
tmp5 = vec_sub(col2, col5);
|
||||
tmp3 = vec_add(col3, col4);
|
||||
tmp4 = vec_sub(col3, col4);
|
||||
|
||||
DO_FDCT();
|
||||
|
||||
/* Pass 2: process columns */
|
||||
|
||||
TRANSPOSE(out, row);
|
||||
|
||||
tmp0 = vec_add(row0, row7);
|
||||
tmp7 = vec_sub(row0, row7);
|
||||
tmp1 = vec_add(row1, row6);
|
||||
tmp6 = vec_sub(row1, row6);
|
||||
tmp2 = vec_add(row2, row5);
|
||||
tmp5 = vec_sub(row2, row5);
|
||||
tmp3 = vec_add(row3, row4);
|
||||
tmp4 = vec_sub(row3, row4);
|
||||
|
||||
DO_FDCT();
|
||||
|
||||
vec_st(out0, 0, data);
|
||||
vec_st(out1, 16, data);
|
||||
vec_st(out2, 32, data);
|
||||
vec_st(out3, 48, data);
|
||||
vec_st(out4, 64, data);
|
||||
vec_st(out5, 80, data);
|
||||
vec_st(out6, 96, data);
|
||||
vec_st(out7, 112, data);
|
||||
}
|
||||
262
simd/jfdctint-altivec.c
Normal file
262
simd/jfdctint-altivec.c
Normal file
@@ -0,0 +1,262 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* SLOW INTEGER FORWARD DCT */
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
|
||||
/*
 * Part of a 1-D slow-integer forward DCT pass that is identical for the row
 * pass and the column pass: outputs 2 and 6 of the even part, and the whole
 * odd part (outputs 1, 3, 5, 7).
 *
 * PASS is 1 or 2 and selects, by token pasting, the rounding constant
 * pd_descale_p<PASS> and the shift count descale_p<PASS>.
 *
 * Expects tmp4..tmp7, tmp12 and tmp13 and writes out1, out2, out3, out5,
 * out6 and out7.  All temporaries and pw_* constants it names must be
 * declared in the invoking scope.
 *
 * Each pair of inputs is interleaved with vec_merge*() so that a single
 * vec_msums() computes a * c0 + b * c1 + accumulator in 32-bit lanes.
 *
 * The body is wrapped in do { } while (0) so that "DO_FDCT_COMMON(n);" is a
 * single statement and remains safe inside an unbraced if/else.
 */
#define DO_FDCT_COMMON(PASS)  \
  do {  \
    /* (Original)  \
     * z1 = (tmp12 + tmp13) * 0.541196100;  \
     * data2 = z1 + tmp13 * 0.765366865;  \
     * data6 = z1 + tmp12 * -1.847759065;  \
     *  \
     * (This implementation)  \
     * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
     * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
     */  \
    \
    tmp1312l = vec_mergeh(tmp13, tmp12);  \
    tmp1312h = vec_mergel(tmp13, tmp12);  \
    \
    out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
    out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
    out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
    out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
    \
    out2l = vec_sra(out2l, descale_p##PASS);  \
    out2h = vec_sra(out2h, descale_p##PASS);  \
    out6l = vec_sra(out6l, descale_p##PASS);  \
    out6h = vec_sra(out6h, descale_p##PASS);  \
    \
    out2 = vec_pack(out2l, out2h);  \
    out6 = vec_pack(out6l, out6h);  \
    \
    /* Odd part */  \
    \
    z3 = vec_add(tmp4, tmp6);  \
    z4 = vec_add(tmp5, tmp7);  \
    \
    /* (Original)  \
     * z5 = (z3 + z4) * 1.175875602;  \
     * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
     * z3 += z5;  z4 += z5;  \
     *  \
     * (This implementation; z3 and z4 on the right are the sums above)  \
     * z3l/z3h = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
     * z4l/z4h = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
     * The rounding constant is folded in here, so it is added only once  \
     * to each of the odd outputs below.  \
     */  \
    \
    z34l = vec_mergeh(z3, z4);  \
    z34h = vec_mergel(z3, z4);  \
    \
    z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
    z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
    z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
    z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
    \
    /* (Original)  \
     * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
     * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
     * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
     * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
     * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
     * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
     *  \
     * (This implementation; tmp4..tmp7 are the unmodified inputs)  \
     * data7 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223  \
     *         + z3;  \
     * data1 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223)  \
     *         + z4;  \
     * data5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447  \
     *         + z4;  \
     * data3 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447)  \
     *         + z3;  \
     */  \
    \
    tmp47l = vec_mergeh(tmp4, tmp7);  \
    tmp47h = vec_mergel(tmp4, tmp7);  \
    \
    out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  \
    out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
    out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
    out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
    \
    out7l = vec_sra(out7l, descale_p##PASS);  \
    out7h = vec_sra(out7h, descale_p##PASS);  \
    out1l = vec_sra(out1l, descale_p##PASS);  \
    out1h = vec_sra(out1h, descale_p##PASS);  \
    \
    out7 = vec_pack(out7l, out7h);  \
    out1 = vec_pack(out1l, out1h);  \
    \
    tmp56l = vec_mergeh(tmp5, tmp6);  \
    tmp56h = vec_mergel(tmp5, tmp6);  \
    \
    out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
    out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
    out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
    out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
    \
    out5l = vec_sra(out5l, descale_p##PASS);  \
    out5h = vec_sra(out5h, descale_p##PASS);  \
    out3l = vec_sra(out3l, descale_p##PASS);  \
    out3h = vec_sra(out3h, descale_p##PASS);  \
    \
    out5 = vec_pack(out5l, out5h);  \
    out3 = vec_pack(out3l, out3h);  \
  } while (0)
|
||||
|
||||
/*
 * Row pass (pass 1) of the slow-integer forward DCT.
 *
 * Expects tmp0..tmp7 and writes out0..out7.  Outputs 0 and 4 need no
 * multiplication, so they are simply scaled up by PASS1_BITS; the remaining
 * outputs come from DO_FDCT_COMMON(1).
 *
 * The body is wrapped in do { } while (0) so that "DO_FDCT_ROWS();" is a
 * single statement and remains safe inside an unbraced if/else.
 */
#define DO_FDCT_ROWS()  \
  do {  \
    /* Even part */  \
    \
    tmp10 = vec_add(tmp0, tmp3);  \
    tmp13 = vec_sub(tmp0, tmp3);  \
    tmp11 = vec_add(tmp1, tmp2);  \
    tmp12 = vec_sub(tmp1, tmp2);  \
    \
    out0 = vec_add(tmp10, tmp11);  \
    out0 = vec_sl(out0, pass1_bits);  \
    out4 = vec_sub(tmp10, tmp11);  \
    out4 = vec_sl(out4, pass1_bits);  \
    \
    DO_FDCT_COMMON(1);  \
  } while (0)
|
||||
|
||||
/*
 * Column pass (pass 2) of the slow-integer forward DCT.
 *
 * Expects tmp0..tmp7 and writes out0..out7.  Outputs 0 and 4 are rounded by
 * adding pw_descale_p2x (1 << (PASS1_BITS - 1)) and then shifted right
 * arithmetically by PASS1_BITS, removing the scaling applied in the row
 * pass; the remaining outputs come from DO_FDCT_COMMON(2).
 *
 * The body is wrapped in do { } while (0) so that "DO_FDCT_COLS();" is a
 * single statement and remains safe inside an unbraced if/else.
 */
#define DO_FDCT_COLS()  \
  do {  \
    /* Even part */  \
    \
    tmp10 = vec_add(tmp0, tmp3);  \
    tmp13 = vec_sub(tmp0, tmp3);  \
    tmp11 = vec_add(tmp1, tmp2);  \
    tmp12 = vec_sub(tmp1, tmp2);  \
    \
    out0 = vec_add(tmp10, tmp11);  \
    out0 = vec_add(out0, pw_descale_p2x);  \
    out0 = vec_sra(out0, pass1_bits);  \
    out4 = vec_sub(tmp10, tmp11);  \
    out4 = vec_add(out4, pw_descale_p2x);  \
    out4 = vec_sra(out4, pass1_bits);  \
    \
    DO_FDCT_COMMON(2);  \
  } while (0)
|
||||
|
||||
|
||||
void
|
||||
jsimd_fdct_islow_altivec (DCTELEM *data)
|
||||
{
|
||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||
tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
|
||||
z3, z4, z34l, z34h,
|
||||
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||
__vector int z3l, z3h, z4l, z4h,
|
||||
out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
|
||||
out7l, out7h;
|
||||
|
||||
/* Constants */
|
||||
__vector short
|
||||
pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
|
||||
pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
|
||||
pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
|
||||
pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
|
||||
pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
|
||||
pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
|
||||
pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
|
||||
pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
|
||||
pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
|
||||
__vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
|
||||
__vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
|
||||
pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
|
||||
__vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
|
||||
descale_p2 = { __4X(DESCALE_P2) };
|
||||
|
||||
/* Pass 1: process rows */
|
||||
|
||||
row0 = vec_ld(0, data);
|
||||
row1 = vec_ld(16, data);
|
||||
row2 = vec_ld(32, data);
|
||||
row3 = vec_ld(48, data);
|
||||
row4 = vec_ld(64, data);
|
||||
row5 = vec_ld(80, data);
|
||||
row6 = vec_ld(96, data);
|
||||
row7 = vec_ld(112, data);
|
||||
|
||||
TRANSPOSE(row, col);
|
||||
|
||||
tmp0 = vec_add(col0, col7);
|
||||
tmp7 = vec_sub(col0, col7);
|
||||
tmp1 = vec_add(col1, col6);
|
||||
tmp6 = vec_sub(col1, col6);
|
||||
tmp2 = vec_add(col2, col5);
|
||||
tmp5 = vec_sub(col2, col5);
|
||||
tmp3 = vec_add(col3, col4);
|
||||
tmp4 = vec_sub(col3, col4);
|
||||
|
||||
DO_FDCT_ROWS();
|
||||
|
||||
/* Pass 2: process columns */
|
||||
|
||||
TRANSPOSE(out, row);
|
||||
|
||||
tmp0 = vec_add(row0, row7);
|
||||
tmp7 = vec_sub(row0, row7);
|
||||
tmp1 = vec_add(row1, row6);
|
||||
tmp6 = vec_sub(row1, row6);
|
||||
tmp2 = vec_add(row2, row5);
|
||||
tmp5 = vec_sub(row2, row5);
|
||||
tmp3 = vec_add(row3, row4);
|
||||
tmp4 = vec_sub(row3, row4);
|
||||
|
||||
DO_FDCT_COLS();
|
||||
|
||||
vec_st(out0, 0, data);
|
||||
vec_st(out1, 16, data);
|
||||
vec_st(out2, 32, data);
|
||||
vec_st(out3, 48, data);
|
||||
vec_st(out4, 64, data);
|
||||
vec_st(out5, 80, data);
|
||||
vec_st(out6, 96, data);
|
||||
vec_st(out7, 112, data);
|
||||
}
|
||||
@@ -444,11 +444,11 @@ EXTN(jsimd_fdct_islow_mmx):
|
||||
psubw mm6,mm4 ; mm6=tmp12
|
||||
|
||||
movq mm7,mm5
|
||||
paddw mm5,mm0 ; mm5=tmp10+tmp11
|
||||
psubw mm7,mm0 ; mm7=tmp10-tmp11
|
||||
paddsw mm5,mm0 ; mm5=tmp10+tmp11
|
||||
psubsw mm7,mm0 ; mm7=tmp10-tmp11
|
||||
|
||||
paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddsw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddsw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
psraw mm5,PASS1_BITS ; mm5=data0
|
||||
psraw mm7,PASS1_BITS ; mm7=data4
|
||||
|
||||
|
||||
@@ -454,11 +454,11 @@ EXTN(jsimd_fdct_islow_sse2):
|
||||
psubw xmm6,xmm4 ; xmm6=tmp12
|
||||
|
||||
movdqa xmm5,xmm7
|
||||
paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
|
||||
psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
|
||||
paddsw xmm7,xmm2 ; xmm7=tmp10+tmp11
|
||||
psubsw xmm5,xmm2 ; xmm5=tmp10-tmp11
|
||||
|
||||
paddw xmm7,[rel PW_DESCALE_P2X]
|
||||
paddw xmm5,[rel PW_DESCALE_P2X]
|
||||
paddsw xmm7,[rel PW_DESCALE_P2X]
|
||||
paddsw xmm5,[rel PW_DESCALE_P2X]
|
||||
psraw xmm7,PASS1_BITS ; xmm7=data0
|
||||
psraw xmm5,PASS1_BITS ; xmm5=data4
|
||||
|
||||
|
||||
@@ -462,11 +462,11 @@ EXTN(jsimd_fdct_islow_sse2):
|
||||
psubw xmm6,xmm4 ; xmm6=tmp12
|
||||
|
||||
movdqa xmm5,xmm7
|
||||
paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
|
||||
psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
|
||||
paddsw xmm7,xmm2 ; xmm7=tmp10+tmp11
|
||||
psubsw xmm5,xmm2 ; xmm5=tmp10-tmp11
|
||||
|
||||
paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddsw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddsw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
psraw xmm7,PASS1_BITS ; xmm7=data0
|
||||
psraw xmm5,PASS1_BITS ; xmm5=data4
|
||||
|
||||
|
||||
256
simd/jidctfst-altivec.c
Normal file
256
simd/jidctfst-altivec.c
Normal file
@@ -0,0 +1,256 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* FAST INTEGER INVERSE DCT
|
||||
*
|
||||
* This is similar to the SSE2 implementation, except that we left-shift the
|
||||
* constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
|
||||
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
|
||||
* the elements in arg3 + the most significant 17 bits of
|
||||
* (the elements in arg1 * the elements in arg2).
|
||||
*/
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
/* Fixed-point multipliers: each value is the real constant scaled by
 * 2^CONST_BITS (= 256) and rounded, as the FIX() notes indicate. */
#define F_1_082 277 /* FIX(1.082392200) */
|
||||
#define F_1_414 362 /* FIX(1.414213562) */
|
||||
#define F_1_847 473 /* FIX(1.847759065) */
|
||||
#define F_2_613 669 /* FIX(2.613125930) */
|
||||
#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
|
||||
|
||||
#define CONST_BITS 8
|
||||
#define PASS1_BITS 2
|
||||
#define PRE_MULTIPLY_SCALE_BITS 2
|
||||
/* Evaluates to 5.  The multipliers above are left-shifted by this amount when
 * the pw_* constant vectors are built; the trailing -1 is the adjustment for
 * vec_madds() described in the file header comment. */
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
|
||||
|
||||
|
||||
/* One 1-D pass of the fast integer IDCT over eight vectors.
 *
 * "in" is a name prefix: the macro reads in##0 .. in##7 (e.g. col0..col7 or
 * row0..row7) and writes out0 .. out7.  It does not declare anything itself;
 * the temporaries (tmp0-tmp7, tmp10-tmp13, z5, z10, z10s, z11, z12s, z13) and
 * the constants (zero, pw_F1414, pw_F1847, pw_F1082, pw_MF1613,
 * pre_multiply_scale_bits) must already exist in the enclosing scope.
 *
 * Values that feed vec_madds() are first shifted left by
 * pre_multiply_scale_bits.  No final descaling is performed inside the macro.
 */
#define DO_IDCT(in) \
|
||||
{ \
|
||||
/* Even part */ \
|
||||
\
|
||||
tmp10 = vec_add(in##0, in##4); \
|
||||
tmp11 = vec_sub(in##0, in##4); \
|
||||
tmp13 = vec_add(in##2, in##6); \
|
||||
\
|
||||
tmp12 = vec_sub(in##2, in##6); \
|
||||
tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
|
||||
tmp12 = vec_madds(tmp12, pw_F1414, zero); \
|
||||
tmp12 = vec_sub(tmp12, tmp13); \
|
||||
\
|
||||
tmp0 = vec_add(tmp10, tmp13); \
|
||||
tmp3 = vec_sub(tmp10, tmp13); \
|
||||
tmp1 = vec_add(tmp11, tmp12); \
|
||||
tmp2 = vec_sub(tmp11, tmp12); \
|
||||
\
|
||||
/* Odd part */ \
|
||||
\
|
||||
z13 = vec_add(in##5, in##3); \
|
||||
z10 = vec_sub(in##5, in##3); \
|
||||
z10s = vec_sl(z10, pre_multiply_scale_bits); \
|
||||
z11 = vec_add(in##1, in##7); \
|
||||
z12s = vec_sub(in##1, in##7); \
|
||||
z12s = vec_sl(z12s, pre_multiply_scale_bits); \
|
||||
\
|
||||
tmp11 = vec_sub(z11, z13); \
|
||||
tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
|
||||
tmp11 = vec_madds(tmp11, pw_F1414, zero); \
|
||||
\
|
||||
tmp7 = vec_add(z11, z13); \
|
||||
\
|
||||
/* To avoid overflow... \
|
||||
* \
|
||||
* (Original) \
|
||||
* tmp12 = -2.613125930 * z10 + z5; \
|
||||
* \
|
||||
* (This implementation) \
|
||||
* tmp12 = (-1.613125930 - 1) * z10 + z5; \
|
||||
* = -1.613125930 * z10 - z10 + z5; \
|
||||
*/ \
|
||||
\
|
||||
z5 = vec_add(z10s, z12s); \
|
||||
z5 = vec_madds(z5, pw_F1847, zero); \
|
||||
\
|
||||
tmp10 = vec_madds(z12s, pw_F1082, zero); \
|
||||
tmp10 = vec_sub(tmp10, z5); \
|
||||
tmp12 = vec_madds(z10s, pw_MF1613, z5); \
|
||||
tmp12 = vec_sub(tmp12, z10); \
|
||||
\
|
||||
tmp6 = vec_sub(tmp12, tmp7); \
|
||||
tmp5 = vec_sub(tmp11, tmp6); \
|
||||
tmp4 = vec_add(tmp10, tmp5); \
|
||||
\
|
||||
out0 = vec_add(tmp0, tmp7); \
|
||||
out1 = vec_add(tmp1, tmp6); \
|
||||
out2 = vec_add(tmp2, tmp5); \
|
||||
out3 = vec_sub(tmp3, tmp4); \
|
||||
out4 = vec_add(tmp3, tmp4); \
|
||||
out5 = vec_sub(tmp2, tmp5); \
|
||||
out6 = vec_sub(tmp1, tmp6); \
|
||||
out7 = vec_sub(tmp0, tmp7); \
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
short *dct_table = (short *)dct_table_;
|
||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||
quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
|
||||
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||
z5, z10, z10s, z11, z12s, z13,
|
||||
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||
__vector signed char outb;
|
||||
int *outptr;
|
||||
|
||||
/* Constants */
|
||||
__vector short zero = { __8X(0) },
|
||||
pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
|
||||
pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
|
||||
pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
|
||||
pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
|
||||
__vector unsigned short
|
||||
pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
|
||||
pass1_bits3 = { __8X(PASS1_BITS + 3) };
|
||||
__vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
|
||||
|
||||
/* Pass 1: process columns */
|
||||
|
||||
col0 = vec_ld(0, coef_block);
|
||||
col1 = vec_ld(16, coef_block);
|
||||
col2 = vec_ld(32, coef_block);
|
||||
col3 = vec_ld(48, coef_block);
|
||||
col4 = vec_ld(64, coef_block);
|
||||
col5 = vec_ld(80, coef_block);
|
||||
col6 = vec_ld(96, coef_block);
|
||||
col7 = vec_ld(112, coef_block);
|
||||
|
||||
tmp1 = vec_or(col1, col2);
|
||||
tmp2 = vec_or(col3, col4);
|
||||
tmp1 = vec_or(tmp1, tmp2);
|
||||
tmp3 = vec_or(col5, col6);
|
||||
tmp3 = vec_or(tmp3, col7);
|
||||
tmp1 = vec_or(tmp1, tmp3);
|
||||
|
||||
quant0 = *(__vector short *)&dct_table[0];
|
||||
col0 = vec_mladd(col0, quant0, zero);
|
||||
|
||||
if (vec_all_eq(tmp1, zero)) {
|
||||
/* AC terms all zero */
|
||||
|
||||
row0 = vec_splat(col0, 0);
|
||||
row1 = vec_splat(col0, 1);
|
||||
row2 = vec_splat(col0, 2);
|
||||
row3 = vec_splat(col0, 3);
|
||||
row4 = vec_splat(col0, 4);
|
||||
row5 = vec_splat(col0, 5);
|
||||
row6 = vec_splat(col0, 6);
|
||||
row7 = vec_splat(col0, 7);
|
||||
|
||||
} else {
|
||||
|
||||
quant1 = *(__vector short *)&dct_table[8];
|
||||
quant2 = *(__vector short *)&dct_table[16];
|
||||
quant3 = *(__vector short *)&dct_table[24];
|
||||
quant4 = *(__vector short *)&dct_table[32];
|
||||
quant5 = *(__vector short *)&dct_table[40];
|
||||
quant6 = *(__vector short *)&dct_table[48];
|
||||
quant7 = *(__vector short *)&dct_table[56];
|
||||
|
||||
col1 = vec_mladd(col1, quant1, zero);
|
||||
col2 = vec_mladd(col2, quant2, zero);
|
||||
col3 = vec_mladd(col3, quant3, zero);
|
||||
col4 = vec_mladd(col4, quant4, zero);
|
||||
col5 = vec_mladd(col5, quant5, zero);
|
||||
col6 = vec_mladd(col6, quant6, zero);
|
||||
col7 = vec_mladd(col7, quant7, zero);
|
||||
|
||||
DO_IDCT(col);
|
||||
|
||||
TRANSPOSE(out, row);
|
||||
}
|
||||
|
||||
/* Pass 2: process rows */
|
||||
|
||||
DO_IDCT(row);
|
||||
|
||||
out0 = vec_sra(out0, pass1_bits3);
|
||||
out1 = vec_sra(out1, pass1_bits3);
|
||||
out2 = vec_sra(out2, pass1_bits3);
|
||||
out3 = vec_sra(out3, pass1_bits3);
|
||||
out4 = vec_sra(out4, pass1_bits3);
|
||||
out5 = vec_sra(out5, pass1_bits3);
|
||||
out6 = vec_sra(out6, pass1_bits3);
|
||||
out7 = vec_sra(out7, pass1_bits3);
|
||||
|
||||
TRANSPOSE(out, col);
|
||||
|
||||
outb = vec_packs(col0, col0);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[0] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col1, col1);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[1] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col2, col2);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[2] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col3, col3);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[3] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col4, col4);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[4] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col5, col5);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[5] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col6, col6);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[6] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col7, col7);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[7] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
}
|
||||
358
simd/jidctint-altivec.c
Normal file
358
simd/jidctint-altivec.c
Normal file
@@ -0,0 +1,358 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* SLOW INTEGER INVERSE DCT */
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
/* Fixed-point multipliers: each value is the real constant scaled by
 * 2^CONST_BITS (= 8192) and rounded, as the FIX() notes indicate. */
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
/* Right-shift counts applied at the end of pass 1 (11) and pass 2 (18). */
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
|
||||
|
||||
|
||||
/* One 1-D pass of the slow (accurate) integer IDCT over eight vectors.
 *
 * "in" is a name prefix: the macro reads in##0 .. in##7 and writes the 16-bit
 * results out0 .. out7.  "PASS" (1 or 2) is pasted onto pd_descale_p and
 * descale_p to pick the rounding constant and the right-shift count for that
 * pass.
 *
 * The arithmetic is carried out on 32-bit elements.  Variables suffixed "l"
 * hold the result of the vec_mergeh()/vec_unpackh() half and variables
 * suffixed "h" the result of the vec_mergel()/vec_unpackl() half; the two
 * halves are recombined with vec_pack() at the end.
 *
 * Nothing is declared here: all temporaries (including in##26l/h, in##71l/h,
 * in##53l/h) and the pw_*, pd_descale_p*, descale_p*, const_bits and zero32
 * constants must already exist in the enclosing scope.
 */
#define DO_IDCT(in, PASS) \
|
||||
{ \
|
||||
/* Even part \
|
||||
* \
|
||||
* (Original) \
|
||||
* z1 = (z2 + z3) * 0.541196100; \
|
||||
* tmp2 = z1 + z3 * -1.847759065; \
|
||||
* tmp3 = z1 + z2 * 0.765366865; \
|
||||
* \
|
||||
* (This implementation) \
|
||||
* tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
|
||||
* tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
|
||||
*/ \
|
||||
\
|
||||
in##26l = vec_mergeh(in##2, in##6); \
|
||||
in##26h = vec_mergel(in##2, in##6); \
|
||||
\
|
||||
tmp3l = vec_msums(in##26l, pw_f130_f054, zero32); \
|
||||
tmp3h = vec_msums(in##26h, pw_f130_f054, zero32); \
|
||||
tmp2l = vec_msums(in##26l, pw_f054_mf130, zero32); \
|
||||
tmp2h = vec_msums(in##26h, pw_f054_mf130, zero32); \
|
||||
\
|
||||
tmp0 = vec_add(in##0, in##4); \
|
||||
tmp1 = vec_sub(in##0, in##4); \
|
||||
\
|
||||
tmp0l = vec_unpackh(tmp0); \
|
||||
tmp0h = vec_unpackl(tmp0); \
|
||||
tmp0l = vec_sl(tmp0l, const_bits); \
|
||||
tmp0h = vec_sl(tmp0h, const_bits); \
|
||||
tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
|
||||
tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
|
||||
\
|
||||
tmp10l = vec_add(tmp0l, tmp3l); \
|
||||
tmp10h = vec_add(tmp0h, tmp3h); \
|
||||
tmp13l = vec_sub(tmp0l, tmp3l); \
|
||||
tmp13h = vec_sub(tmp0h, tmp3h); \
|
||||
\
|
||||
tmp1l = vec_unpackh(tmp1); \
|
||||
tmp1h = vec_unpackl(tmp1); \
|
||||
tmp1l = vec_sl(tmp1l, const_bits); \
|
||||
tmp1h = vec_sl(tmp1h, const_bits); \
|
||||
tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
|
||||
tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
|
||||
\
|
||||
tmp11l = vec_add(tmp1l, tmp2l); \
|
||||
tmp11h = vec_add(tmp1h, tmp2h); \
|
||||
tmp12l = vec_sub(tmp1l, tmp2l); \
|
||||
tmp12h = vec_sub(tmp1h, tmp2h); \
|
||||
\
|
||||
/* Odd part */ \
|
||||
\
|
||||
z3 = vec_add(in##3, in##7); \
|
||||
z4 = vec_add(in##1, in##5); \
|
||||
\
|
||||
/* (Original) \
|
||||
* z5 = (z3 + z4) * 1.175875602; \
|
||||
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
|
||||
* z3 += z5; z4 += z5; \
|
||||
* \
|
||||
* (This implementation) \
|
||||
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
|
||||
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
|
||||
*/ \
|
||||
\
|
||||
z34l = vec_mergeh(z3, z4); \
|
||||
z34h = vec_mergel(z3, z4); \
|
||||
\
|
||||
z3l = vec_msums(z34l, pw_mf078_f117, zero32); \
|
||||
z3h = vec_msums(z34h, pw_mf078_f117, zero32); \
|
||||
z4l = vec_msums(z34l, pw_f117_f078, zero32); \
|
||||
z4h = vec_msums(z34h, pw_f117_f078, zero32); \
|
||||
\
|
||||
/* (Original) \
|
||||
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
|
||||
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
|
||||
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
|
||||
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
|
||||
* tmp0 += z1 + z3; tmp1 += z2 + z4; \
|
||||
* tmp2 += z2 + z3; tmp3 += z1 + z4; \
|
||||
* \
|
||||
* (This implementation) \
|
||||
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
|
||||
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
|
||||
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
|
||||
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
|
||||
* tmp0 += z3; tmp1 += z4; \
|
||||
* tmp2 += z3; tmp3 += z4; \
|
||||
*/ \
|
||||
\
|
||||
in##71l = vec_mergeh(in##7, in##1); \
|
||||
in##71h = vec_mergel(in##7, in##1); \
|
||||
\
|
||||
tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
|
||||
tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
|
||||
tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
|
||||
tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
|
||||
\
|
||||
in##53l = vec_mergeh(in##5, in##3); \
|
||||
in##53h = vec_mergel(in##5, in##3); \
|
||||
\
|
||||
tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
|
||||
tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
|
||||
tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
|
||||
tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
|
||||
\
|
||||
/* Final output stage */ \
|
||||
\
|
||||
out0l = vec_add(tmp10l, tmp3l); \
|
||||
out0h = vec_add(tmp10h, tmp3h); \
|
||||
out7l = vec_sub(tmp10l, tmp3l); \
|
||||
out7h = vec_sub(tmp10h, tmp3h); \
|
||||
\
|
||||
out0l = vec_sra(out0l, descale_p##PASS); \
|
||||
out0h = vec_sra(out0h, descale_p##PASS); \
|
||||
out7l = vec_sra(out7l, descale_p##PASS); \
|
||||
out7h = vec_sra(out7h, descale_p##PASS); \
|
||||
\
|
||||
out0 = vec_pack(out0l, out0h); \
|
||||
out7 = vec_pack(out7l, out7h); \
|
||||
\
|
||||
out1l = vec_add(tmp11l, tmp2l); \
|
||||
out1h = vec_add(tmp11h, tmp2h); \
|
||||
out6l = vec_sub(tmp11l, tmp2l); \
|
||||
out6h = vec_sub(tmp11h, tmp2h); \
|
||||
\
|
||||
out1l = vec_sra(out1l, descale_p##PASS); \
|
||||
out1h = vec_sra(out1h, descale_p##PASS); \
|
||||
out6l = vec_sra(out6l, descale_p##PASS); \
|
||||
out6h = vec_sra(out6h, descale_p##PASS); \
|
||||
\
|
||||
out1 = vec_pack(out1l, out1h); \
|
||||
out6 = vec_pack(out6l, out6h); \
|
||||
\
|
||||
out2l = vec_add(tmp12l, tmp1l); \
|
||||
out2h = vec_add(tmp12h, tmp1h); \
|
||||
out5l = vec_sub(tmp12l, tmp1l); \
|
||||
out5h = vec_sub(tmp12h, tmp1h); \
|
||||
\
|
||||
out2l = vec_sra(out2l, descale_p##PASS); \
|
||||
out2h = vec_sra(out2h, descale_p##PASS); \
|
||||
out5l = vec_sra(out5l, descale_p##PASS); \
|
||||
out5h = vec_sra(out5h, descale_p##PASS); \
|
||||
\
|
||||
out2 = vec_pack(out2l, out2h); \
|
||||
out5 = vec_pack(out5l, out5h); \
|
||||
\
|
||||
out3l = vec_add(tmp13l, tmp0l); \
|
||||
out3h = vec_add(tmp13h, tmp0h); \
|
||||
out4l = vec_sub(tmp13l, tmp0l); \
|
||||
out4h = vec_sub(tmp13h, tmp0h); \
|
||||
\
|
||||
out3l = vec_sra(out3l, descale_p##PASS); \
|
||||
out3h = vec_sra(out3h, descale_p##PASS); \
|
||||
out4l = vec_sra(out4l, descale_p##PASS); \
|
||||
out4h = vec_sra(out4h, descale_p##PASS); \
|
||||
\
|
||||
out3 = vec_pack(out3l, out3h); \
|
||||
out4 = vec_pack(out4l, out4h); \
|
||||
}
|
||||
|
||||
|
||||
/*
 * Slow (accurate) integer inverse DCT of one 8x8 coefficient block (AltiVec).
 *
 * dct_table_  Multiplier table; read as 64 shorts (eight vectors) and
 *             multiplied element-wise into the coefficients with vec_mladd().
 * coef_block  The 64 input coefficients, read as eight vectors.
 * output_buf  Array of (at least) eight row pointers.  Row i receives eight
 *             samples starting at output_buf[i] + output_col.
 * output_col  Byte offset added to every output row pointer.
 *
 * Pass 1 runs DO_IDCT(col, 1) and transposes.  If vectors 1-7 of the raw
 * coefficient block are all zero, pass 1 is replaced by shifting the
 * dequantized first vector left by PASS1_BITS and splatting each of its
 * elements across a row vector.  Pass 2 runs DO_IDCT(row, 2), transposes,
 * packs to signed bytes with saturation and adds CENTERJSAMPLE.
 *
 * NOTE(review): the vector pointer dereferences and vec_ste() stores
 * presumably require 16-byte aligned inputs and 8-byte aligned output
 * addresses -- confirm against the callers.
 */
void
|
||||
jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
short *dct_table = (short *)dct_table_;
|
||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||
quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
|
||||
tmp0, tmp1, tmp2, tmp3, z3, z4,
|
||||
z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
|
||||
row71l, row71h, row26l, row26h, row53l, row53h,
|
||||
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||
__vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
|
||||
tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
|
||||
z3l, z3h, z4l, z4h,
|
||||
out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
|
||||
out5l, out5h, out6l, out6h, out7l, out7h;
|
||||
__vector signed char outb;
|
||||
int *outptr;
|
||||
|
||||
/* Constants */
|
||||
/* Each pw_* vector repeats a pair of 16-bit multipliers four times so that
 * vec_msums() can form a two-term sum of products per 32-bit element. */
__vector short zero16 = { __8X(0) },
|
||||
pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
|
||||
pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
|
||||
pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
|
||||
pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
|
||||
pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
|
||||
pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
|
||||
pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
|
||||
pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
|
||||
__vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
|
||||
/* pd_descale_p1/p2 are the rounding terms (half of the divisor) added before
 * the arithmetic right shift by descale_p1/p2. */
__vector int zero32 = { __4X(0) },
|
||||
pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
|
||||
pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
|
||||
__vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
|
||||
descale_p2 = { __4X(DESCALE_P2) },
|
||||
const_bits = { __4X(CONST_BITS) };
|
||||
__vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
|
||||
|
||||
/* Pass 1: process columns */
|
||||
|
||||
col0 = *(__vector short *)&coef_block[0];
|
||||
col1 = *(__vector short *)&coef_block[8];
|
||||
col2 = *(__vector short *)&coef_block[16];
|
||||
col3 = *(__vector short *)&coef_block[24];
|
||||
col4 = *(__vector short *)&coef_block[32];
|
||||
col5 = *(__vector short *)&coef_block[40];
|
||||
col6 = *(__vector short *)&coef_block[48];
|
||||
col7 = *(__vector short *)&coef_block[56];
|
||||
|
||||
/* OR vectors 1-7 together so a single compare can tell whether every
 * coefficient outside the first vector is zero. */
tmp1 = vec_or(col1, col2);
|
||||
tmp2 = vec_or(col3, col4);
|
||||
tmp1 = vec_or(tmp1, tmp2);
|
||||
tmp3 = vec_or(col5, col6);
|
||||
tmp3 = vec_or(tmp3, col7);
|
||||
tmp1 = vec_or(tmp1, tmp3);
|
||||
|
||||
quant0 = *(__vector short *)&dct_table[0];
|
||||
col0 = vec_mladd(col0, quant0, zero16);
|
||||
|
||||
if (vec_all_eq(tmp1, zero16)) {
|
||||
/* AC terms all zero */
|
||||
|
||||
col0 = vec_sl(col0, pass1_bits);
|
||||
|
||||
row0 = vec_splat(col0, 0);
|
||||
row1 = vec_splat(col0, 1);
|
||||
row2 = vec_splat(col0, 2);
|
||||
row3 = vec_splat(col0, 3);
|
||||
row4 = vec_splat(col0, 4);
|
||||
row5 = vec_splat(col0, 5);
|
||||
row6 = vec_splat(col0, 6);
|
||||
row7 = vec_splat(col0, 7);
|
||||
|
||||
} else {
|
||||
|
||||
quant1 = *(__vector short *)&dct_table[8];
|
||||
quant2 = *(__vector short *)&dct_table[16];
|
||||
quant3 = *(__vector short *)&dct_table[24];
|
||||
quant4 = *(__vector short *)&dct_table[32];
|
||||
quant5 = *(__vector short *)&dct_table[40];
|
||||
quant6 = *(__vector short *)&dct_table[48];
|
||||
quant7 = *(__vector short *)&dct_table[56];
|
||||
|
||||
col1 = vec_mladd(col1, quant1, zero16);
|
||||
col2 = vec_mladd(col2, quant2, zero16);
|
||||
col3 = vec_mladd(col3, quant3, zero16);
|
||||
col4 = vec_mladd(col4, quant4, zero16);
|
||||
col5 = vec_mladd(col5, quant5, zero16);
|
||||
col6 = vec_mladd(col6, quant6, zero16);
|
||||
col7 = vec_mladd(col7, quant7, zero16);
|
||||
|
||||
DO_IDCT(col, 1);
|
||||
|
||||
TRANSPOSE(out, row);
|
||||
}
|
||||
|
||||
/* Pass 2: process rows */
|
||||
|
||||
DO_IDCT(row, 2);
|
||||
|
||||
TRANSPOSE(out, col);
|
||||
|
||||
/* Each colN is packed with itself, so both 8-byte halves of outb hold the
 * same eight samples; the two 4-byte element stores then write them out. */
outb = vec_packs(col0, col0);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[0] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col1, col1);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[1] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col2, col2);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[2] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col3, col3);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[3] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col4, col4);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[4] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col5, col5);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[5] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col6, col6);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[6] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
|
||||
outb = vec_packs(col7, col7);
|
||||
outb = vec_add(outb, pb_centerjsamp);
|
||||
outptr = (int *)(output_buf[7] + output_col);
|
||||
vec_ste((__vector int)outb, 0, outptr);
|
||||
vec_ste((__vector int)outb, 4, outptr);
|
||||
}
|
||||
236
simd/jquanti-altivec.c
Normal file
236
simd/jquanti-altivec.c
Normal file
@@ -0,0 +1,236 @@
|
||||
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2014, D. R. Commander.
|
||||
* All rights reserved.
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
|
||||
* always get the data we want by using a single vector load (although we may
|
||||
* have to permute the result.)
|
||||
*/
|
||||
/* Loads the samples of row "row" starting at column start_col into in<row>.
 * Uses elemptr, sample_data and start_col from the enclosing scope.  When the
 * address is not 16-byte aligned, the loaded vector is rotated with
 * vec_perm()/vec_lvsl() so the wanted bytes come first. */
#define LOAD_ROW(row) { \
|
||||
elemptr = sample_data[row] + start_col; \
|
||||
in##row = vec_ld(0, elemptr); \
|
||||
if ((size_t)elemptr & 15) \
|
||||
in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
|
||||
}
|
||||
|
||||
|
||||
/*
 * Sample conversion: loads an 8x8 block of samples, widens each sample to a
 * 16-bit element, subtracts CENTERJSAMPLE and stores the result.
 *
 * sample_data  Array of (at least) eight row pointers.
 * start_col    Column offset of the first sample to read in every row.
 * workspace    Destination; receives eight vectors at byte offsets
 *              0, 16, ..., 112 (64 elements in total).
 *
 * NOTE(review): vec_st() presumably requires workspace to be 16-byte aligned
 * -- confirm against the callers.
 */
void
|
||||
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
JSAMPROW elemptr;
|
||||
__vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
|
||||
__vector short out0, out1, out2, out3, out4, out5, out6, out7;
|
||||
|
||||
/* Constants */
|
||||
__vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
|
||||
__vector unsigned char zero = { __16X(0) };
|
||||
|
||||
LOAD_ROW(0);
|
||||
LOAD_ROW(1);
|
||||
LOAD_ROW(2);
|
||||
LOAD_ROW(3);
|
||||
LOAD_ROW(4);
|
||||
LOAD_ROW(5);
|
||||
LOAD_ROW(6);
|
||||
LOAD_ROW(7);
|
||||
|
||||
/* Interleave a zero byte ahead of each of the first eight sample bytes, then
 * reinterpret the byte pairs as 16-bit elements. */
out0 = (__vector short)vec_mergeh(zero, in0);
|
||||
out1 = (__vector short)vec_mergeh(zero, in1);
|
||||
out2 = (__vector short)vec_mergeh(zero, in2);
|
||||
out3 = (__vector short)vec_mergeh(zero, in3);
|
||||
out4 = (__vector short)vec_mergeh(zero, in4);
|
||||
out5 = (__vector short)vec_mergeh(zero, in5);
|
||||
out6 = (__vector short)vec_mergeh(zero, in6);
|
||||
out7 = (__vector short)vec_mergeh(zero, in7);
|
||||
|
||||
out0 = vec_sub(out0, pw_centerjsamp);
|
||||
out1 = vec_sub(out1, pw_centerjsamp);
|
||||
out2 = vec_sub(out2, pw_centerjsamp);
|
||||
out3 = vec_sub(out3, pw_centerjsamp);
|
||||
out4 = vec_sub(out4, pw_centerjsamp);
|
||||
out5 = vec_sub(out5, pw_centerjsamp);
|
||||
out6 = vec_sub(out6, pw_centerjsamp);
|
||||
out7 = vec_sub(out7, pw_centerjsamp);
|
||||
|
||||
vec_st(out0, 0, workspace);
|
||||
vec_st(out1, 16, workspace);
|
||||
vec_st(out2, 32, workspace);
|
||||
vec_st(out3, 48, workspace);
|
||||
vec_st(out4, 64, workspace);
|
||||
vec_st(out5, 80, workspace);
|
||||
vec_st(out6, 96, workspace);
|
||||
vec_st(out7, 112, workspace);
|
||||
}
|
||||
|
||||
|
||||
#define WORD_BIT 16
|
||||
|
||||
/* There is no AltiVec 16-bit unsigned multiply instruction that directly
 * yields the upper half of each product (an unsigned counterpart of
 * vec_madds()), hence this.  vec_mule()/vec_mulo() form the full 32-bit
 * products of the even and odd elements, and vec_perm() with
 * shift_pack_index then picks the first two bytes of every product and
 * re-interleaves them in the original element order.  Uses tmpe, tmpo and
 * shift_pack_index from the enclosing scope. */
|
||||
|
||||
#define MULTIPLY(vs0, vs1, out) { \
|
||||
tmpe = vec_mule((__vector unsigned short)vs0, \
|
||||
(__vector unsigned short)vs1); \
|
||||
tmpo = vec_mulo((__vector unsigned short)vs0, \
|
||||
(__vector unsigned short)vs1); \
|
||||
out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
|
||||
(__vector unsigned short)tmpo, \
|
||||
shift_pack_index); \
|
||||
}
|
||||
|
||||
/*
 * Quantizes one block of 64 coefficients.
 *
 * coef_block  Destination; receives eight vectors at byte offsets
 *             0, 16, ..., 112.
 * divisors    Three consecutive tables read from this pointer: "recip" at
 *             byte offset 0, "corr" at byte offset DCTSIZE2 * 2 and "scale"
 *             at byte offset DCTSIZE2 * 4.
 * workspace   The 64 input values, read as eight vectors.
 *
 * For every element: take the absolute value, add the correction, multiply
 * by the reciprocal and then by the scale (each time via MULTIPLY), and
 * finally re-apply the original sign.
 *
 * NOTE(review): vec_ld()/vec_st() presumably require all three pointers to be
 * 16-byte aligned -- confirm against the callers.
 */
void
|
||||
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
__vector short row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s;
|
||||
__vector short corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7;
|
||||
__vector short recip0, recip1, recip2, recip3, recip4, recip5, recip6,
|
||||
recip7;
|
||||
__vector short scale0, scale1, scale2, scale3, scale4, scale5, scale6,
|
||||
scale7;
|
||||
__vector unsigned int tmpe, tmpo;
|
||||
|
||||
/* Constants */
|
||||
__vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
|
||||
/* Permute control for MULTIPLY: bytes 0-15 address tmpe, bytes 16-31 address
 * tmpo; the first two bytes of each 32-bit product are selected. */
__vector unsigned char shift_pack_index =
|
||||
{ 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
|
||||
|
||||
row0 = vec_ld(0, workspace);
|
||||
row1 = vec_ld(16, workspace);
|
||||
row2 = vec_ld(32, workspace);
|
||||
row3 = vec_ld(48, workspace);
|
||||
row4 = vec_ld(64, workspace);
|
||||
row5 = vec_ld(80, workspace);
|
||||
row6 = vec_ld(96, workspace);
|
||||
row7 = vec_ld(112, workspace);
|
||||
|
||||
/* Branch-less absolute value */
|
||||
/* rowNs is the sign mask (arithmetic shift by 15); (x ^ s) - s negates x
 * where the mask is all ones and leaves it unchanged where it is zero. */
row0s = vec_sra(row0, pw_word_bit_m1);
|
||||
row1s = vec_sra(row1, pw_word_bit_m1);
|
||||
row2s = vec_sra(row2, pw_word_bit_m1);
|
||||
row3s = vec_sra(row3, pw_word_bit_m1);
|
||||
row4s = vec_sra(row4, pw_word_bit_m1);
|
||||
row5s = vec_sra(row5, pw_word_bit_m1);
|
||||
row6s = vec_sra(row6, pw_word_bit_m1);
|
||||
row7s = vec_sra(row7, pw_word_bit_m1);
|
||||
row0 = vec_xor(row0, row0s);
|
||||
row1 = vec_xor(row1, row1s);
|
||||
row2 = vec_xor(row2, row2s);
|
||||
row3 = vec_xor(row3, row3s);
|
||||
row4 = vec_xor(row4, row4s);
|
||||
row5 = vec_xor(row5, row5s);
|
||||
row6 = vec_xor(row6, row6s);
|
||||
row7 = vec_xor(row7, row7s);
|
||||
row0 = vec_sub(row0, row0s);
|
||||
row1 = vec_sub(row1, row1s);
|
||||
row2 = vec_sub(row2, row2s);
|
||||
row3 = vec_sub(row3, row3s);
|
||||
row4 = vec_sub(row4, row4s);
|
||||
row5 = vec_sub(row5, row5s);
|
||||
row6 = vec_sub(row6, row6s);
|
||||
row7 = vec_sub(row7, row7s);
|
||||
|
||||
corr0 = vec_ld(DCTSIZE2 * 2, divisors);
|
||||
corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
|
||||
corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
|
||||
corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
|
||||
corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
|
||||
corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
|
||||
corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
|
||||
corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
|
||||
|
||||
row0 = vec_add(row0, corr0);
|
||||
row1 = vec_add(row1, corr1);
|
||||
row2 = vec_add(row2, corr2);
|
||||
row3 = vec_add(row3, corr3);
|
||||
row4 = vec_add(row4, corr4);
|
||||
row5 = vec_add(row5, corr5);
|
||||
row6 = vec_add(row6, corr6);
|
||||
row7 = vec_add(row7, corr7);
|
||||
|
||||
recip0 = vec_ld(0, divisors);
|
||||
recip1 = vec_ld(16, divisors);
|
||||
recip2 = vec_ld(32, divisors);
|
||||
recip3 = vec_ld(48, divisors);
|
||||
recip4 = vec_ld(64, divisors);
|
||||
recip5 = vec_ld(80, divisors);
|
||||
recip6 = vec_ld(96, divisors);
|
||||
recip7 = vec_ld(112, divisors);
|
||||
|
||||
MULTIPLY(row0, recip0, row0);
|
||||
MULTIPLY(row1, recip1, row1);
|
||||
MULTIPLY(row2, recip2, row2);
|
||||
MULTIPLY(row3, recip3, row3);
|
||||
MULTIPLY(row4, recip4, row4);
|
||||
MULTIPLY(row5, recip5, row5);
|
||||
MULTIPLY(row6, recip6, row6);
|
||||
MULTIPLY(row7, recip7, row7);
|
||||
|
||||
scale0 = vec_ld(DCTSIZE2 * 4, divisors);
|
||||
scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
|
||||
scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
|
||||
scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
|
||||
scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
|
||||
scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
|
||||
scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
|
||||
scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
|
||||
|
||||
MULTIPLY(row0, scale0, row0);
|
||||
MULTIPLY(row1, scale1, row1);
|
||||
MULTIPLY(row2, scale2, row2);
|
||||
MULTIPLY(row3, scale3, row3);
|
||||
MULTIPLY(row4, scale4, row4);
|
||||
MULTIPLY(row5, scale5, row5);
|
||||
MULTIPLY(row6, scale6, row6);
|
||||
MULTIPLY(row7, scale7, row7);
|
||||
|
||||
/* Re-apply the sign saved in rowNs (same xor/subtract identity as above). */
row0 = vec_xor(row0, row0s);
|
||||
row1 = vec_xor(row1, row1s);
|
||||
row2 = vec_xor(row2, row2s);
|
||||
row3 = vec_xor(row3, row3s);
|
||||
row4 = vec_xor(row4, row4s);
|
||||
row5 = vec_xor(row5, row5s);
|
||||
row6 = vec_xor(row6, row6s);
|
||||
row7 = vec_xor(row7, row7s);
|
||||
row0 = vec_sub(row0, row0s);
|
||||
row1 = vec_sub(row1, row1s);
|
||||
row2 = vec_sub(row2, row2s);
|
||||
row3 = vec_sub(row3, row3s);
|
||||
row4 = vec_sub(row4, row4s);
|
||||
row5 = vec_sub(row5, row5s);
|
||||
row6 = vec_sub(row6, row6s);
|
||||
row7 = vec_sub(row7, row7s);
|
||||
|
||||
vec_st(row0, 0, coef_block);
|
||||
vec_st(row1, 16, coef_block);
|
||||
vec_st(row2, 32, coef_block);
|
||||
vec_st(row3, 48, coef_block);
|
||||
vec_st(row4, 64, coef_block);
|
||||
vec_st(row5, 80, coef_block);
|
||||
vec_st(row6, 96, coef_block);
|
||||
vec_st(row7, 112, coef_block);
|
||||
}
|
||||
60
simd/jsimd.h
60
simd/jsimd.h
@@ -116,6 +116,28 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
|
||||
EXTERN(void) jsimd_rgb_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extrgb_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extbgr_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
|
||||
/* RGB & extended RGB --> Grayscale Colorspace Conversion */
|
||||
EXTERN(void) jsimd_rgb_gray_convert_mmx
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
@@ -184,6 +206,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
|
||||
EXTERN(void) jsimd_rgb_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extrgb_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extrgbx_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extbgr_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extbgrx_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extxbgr_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
EXTERN(void) jsimd_extxrgb_gray_convert_altivec
|
||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows);
|
||||
|
||||
/* YCC --> RGB & extended RGB Colorspace Conversion */
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_mmx
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
@@ -524,6 +568,9 @@ EXTERN(void) jsimd_convsamp_neon
|
||||
EXTERN(void) jsimd_convsamp_mips_dspr2
|
||||
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
|
||||
|
||||
EXTERN(void) jsimd_convsamp_altivec
|
||||
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
|
||||
|
||||
/* Floating Point Sample Conversion */
|
||||
EXTERN(void) jsimd_convsamp_float_3dnow
|
||||
(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
|
||||
@@ -545,6 +592,8 @@ EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
|
||||
|
||||
EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
|
||||
|
||||
EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
|
||||
|
||||
/* Fast Integer Forward DCT */
|
||||
EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
|
||||
|
||||
@@ -576,6 +625,9 @@ EXTERN(void) jsimd_quantize_neon
|
||||
EXTERN(void) jsimd_quantize_mips_dspr2
|
||||
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
|
||||
|
||||
EXTERN(void) jsimd_quantize_altivec
|
||||
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
|
||||
|
||||
/* Floating Point Quantization */
|
||||
EXTERN(void) jsimd_quantize_float_3dnow
|
||||
(JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
|
||||
@@ -644,6 +696,10 @@ EXTERN(void) jsimd_idct_islow_mips_dspr2
|
||||
(void * dct_table, JCOEFPTR coef_block, int * output_buf,
|
||||
JSAMPLE * output_col);
|
||||
|
||||
EXTERN(void) jsimd_idct_islow_altivec
|
||||
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col);
|
||||
|
||||
/* Fast Integer Inverse DCT */
|
||||
EXTERN(void) jsimd_idct_ifast_mmx
|
||||
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
@@ -665,6 +721,10 @@ EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
|
||||
(DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
|
||||
const int * idct_coefs);
|
||||
|
||||
EXTERN(void) jsimd_idct_ifast_altivec
|
||||
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col);
|
||||
|
||||
/* Floating Point Inverse DCT */
|
||||
EXTERN(void) jsimd_idct_float_3dnow
|
||||
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
|
||||
@@ -29,6 +29,14 @@
|
||||
#include "jsimd.h"
|
||||
#include <altivec.h>
|
||||
|
||||
|
||||
/* Common code */
|
||||
|
||||
#define __4X(a) a, a, a, a
|
||||
#define __4X2(a, b) a, b, a, b, a, b, a, b
|
||||
#define __8X(a) __4X(a), __4X(a)
|
||||
#define __16X(a) __8X(a), __8X(a)
|
||||
|
||||
#define TRANSPOSE(row, col) \
|
||||
{ \
|
||||
__vector short row04l, row04h, row15l, row15h, \
|
||||
@@ -47,7 +55,7 @@
|
||||
row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
|
||||
\
|
||||
/* transpose coefficients (phase 2) */ \
|
||||
col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61} */ \
|
||||
col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
|
||||
col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
|
||||
col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
|
||||
col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
|
||||
@@ -58,7 +66,7 @@
|
||||
\
|
||||
/* transpose coefficients (phase 3) */ \
|
||||
col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
|
||||
col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71} */ \
|
||||
col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
|
||||
col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
|
||||
col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
|
||||
col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
|
||||
@@ -66,125 +74,3 @@
|
||||
col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
|
||||
col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
|
||||
}
|
||||
|
||||
static const __vector short constants __attribute__((aligned(16))) =
|
||||
{
|
||||
98 << 5, /* FIX(0.382683433) */
|
||||
139 << 5, /* FIX(0.541196100) */
|
||||
181 << 5, /* FIX(0.707106781) */
|
||||
334 << 5 /* FIX(1.306562965) */
|
||||
};
|
||||
|
||||
#define DO_DCT() \
|
||||
{ \
|
||||
/* Even part */ \
|
||||
\
|
||||
tmp10 = vec_add(tmp0, tmp3); \
|
||||
tmp13 = vec_sub(tmp0, tmp3); \
|
||||
tmp11 = vec_add(tmp1, tmp2); \
|
||||
tmp12 = vec_sub(tmp1, tmp2); \
|
||||
\
|
||||
out0 = vec_add(tmp10, tmp11); \
|
||||
out4 = vec_sub(tmp10, tmp11); \
|
||||
\
|
||||
z1 = vec_add(tmp12, tmp13); \
|
||||
z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS); \
|
||||
z1 = vec_madds(z1, PW_0707, zero); \
|
||||
\
|
||||
out2 = vec_add(tmp13, z1); \
|
||||
out6 = vec_sub(tmp13, z1); \
|
||||
\
|
||||
/* Odd part */ \
|
||||
\
|
||||
tmp10 = vec_add(tmp4, tmp5); \
|
||||
tmp11 = vec_add(tmp5, tmp6); \
|
||||
tmp12 = vec_add(tmp6, tmp7); \
|
||||
\
|
||||
tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS); \
|
||||
tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS); \
|
||||
z5 = vec_sub(tmp10, tmp12); \
|
||||
z5 = vec_madds(z5, PW_0382, zero); \
|
||||
\
|
||||
z2 = vec_madds(tmp10, PW_0541, zero); \
|
||||
z2 = vec_add(z2, z5); \
|
||||
\
|
||||
z4 = vec_madds(tmp12, PW_1306, zero); \
|
||||
z4 = vec_add(z4, z5); \
|
||||
\
|
||||
tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS); \
|
||||
z3 = vec_madds(tmp11, PW_0707, zero); \
|
||||
\
|
||||
z11 = vec_add(tmp7, z3); \
|
||||
z13 = vec_sub(tmp7, z3); \
|
||||
\
|
||||
out5 = vec_add(z13, z2); \
|
||||
out3 = vec_sub(z13, z2); \
|
||||
out1 = vec_add(z11, z4); \
|
||||
out7 = vec_sub(z11, z4); \
|
||||
}
|
||||
|
||||
void
|
||||
jsimd_fdct_ifast_altivec (DCTELEM *data)
|
||||
{
|
||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||
z1, z2, z3, z4, z5, z11, z13,
|
||||
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||
|
||||
/* Constants */
|
||||
__vector short zero = vec_splat_s16(0),
|
||||
PW_0382 = vec_splat(constants, 0),
|
||||
PW_0541 = vec_splat(constants, 1),
|
||||
PW_0707 = vec_splat(constants, 2),
|
||||
PW_1306 = vec_splat(constants, 3);
|
||||
__vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
|
||||
|
||||
/* Pass 1: process rows. */
|
||||
|
||||
row0 = *(__vector short *)&data[0];
|
||||
row1 = *(__vector short *)&data[8];
|
||||
row2 = *(__vector short *)&data[16];
|
||||
row3 = *(__vector short *)&data[24];
|
||||
row4 = *(__vector short *)&data[32];
|
||||
row5 = *(__vector short *)&data[40];
|
||||
row6 = *(__vector short *)&data[48];
|
||||
row7 = *(__vector short *)&data[56];
|
||||
|
||||
TRANSPOSE(row, col);
|
||||
|
||||
tmp0 = vec_add(col0, col7);
|
||||
tmp7 = vec_sub(col0, col7);
|
||||
tmp1 = vec_add(col1, col6);
|
||||
tmp6 = vec_sub(col1, col6);
|
||||
tmp2 = vec_add(col2, col5);
|
||||
tmp5 = vec_sub(col2, col5);
|
||||
tmp3 = vec_add(col3, col4);
|
||||
tmp4 = vec_sub(col3, col4);
|
||||
|
||||
DO_DCT();
|
||||
|
||||
/* Pass 2: process columns. */
|
||||
|
||||
TRANSPOSE(out, row);
|
||||
|
||||
tmp0 = vec_add(row0, row7);
|
||||
tmp7 = vec_sub(row0, row7);
|
||||
tmp1 = vec_add(row1, row6);
|
||||
tmp6 = vec_sub(row1, row6);
|
||||
tmp2 = vec_add(row2, row5);
|
||||
tmp5 = vec_sub(row2, row5);
|
||||
tmp3 = vec_add(row3, row4);
|
||||
tmp4 = vec_sub(row3, row4);
|
||||
|
||||
DO_DCT();
|
||||
|
||||
*(__vector short *)&data[0] = out0;
|
||||
*(__vector short *)&data[8] = out1;
|
||||
*(__vector short *)&data[16] = out2;
|
||||
*(__vector short *)&data[24] = out3;
|
||||
*(__vector short *)&data[32] = out4;
|
||||
*(__vector short *)&data[40] = out5;
|
||||
*(__vector short *)&data[48] = out6;
|
||||
*(__vector short *)&data[56] = out7;
|
||||
}
|
||||
@@ -6,6 +6,7 @@
|
||||
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
||||
* Copyright (C) 2013-2014, Linaro Limited
|
||||
* Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
|
||||
* Copyright (C) 2014, D. R. Commander. All rights reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
@@ -197,21 +198,21 @@ _\fname:
|
||||
tmp13 = q1; \
|
||||
}
|
||||
|
||||
#define XFIX_0_899976223 v0.4h[0]
|
||||
#define XFIX_0_541196100 v0.4h[1]
|
||||
#define XFIX_2_562915447 v0.4h[2]
|
||||
#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
|
||||
#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
|
||||
#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
|
||||
#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
|
||||
#define XFIX_1_175875602 v1.4h[3]
|
||||
#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
|
||||
#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
|
||||
#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
|
||||
#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
|
||||
#define XFIX_0_899976223 v0.h[0]
|
||||
#define XFIX_0_541196100 v0.h[1]
|
||||
#define XFIX_2_562915447 v0.h[2]
|
||||
#define XFIX_0_298631336_MINUS_0_899976223 v0.h[3]
|
||||
#define XFIX_1_501321110_MINUS_0_899976223 v1.h[0]
|
||||
#define XFIX_2_053119869_MINUS_2_562915447 v1.h[1]
|
||||
#define XFIX_0_541196100_PLUS_0_765366865 v1.h[2]
|
||||
#define XFIX_1_175875602 v1.h[3]
|
||||
#define XFIX_1_175875602_MINUS_0_390180644 v2.h[0]
|
||||
#define XFIX_0_541196100_MINUS_1_847759065 v2.h[1]
|
||||
#define XFIX_3_072711026_MINUS_2_562915447 v2.h[2]
|
||||
#define XFIX_1_175875602_MINUS_1_961570560 v2.h[3]
|
||||
|
||||
.balign 16
|
||||
jsimd_idct_islow_neon_consts:
|
||||
Ljsimd_idct_islow_neon_consts:
|
||||
.short FIX_0_899976223 /* d0[0] */
|
||||
.short FIX_0_541196100 /* d0[1] */
|
||||
.short FIX_2_562915447 /* d0[2] */
|
||||
@@ -256,54 +257,54 @@ asm_function jsimd_idct_islow_neon
|
||||
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
|
||||
sub sp, sp, 272
|
||||
str x15, [sp], 16
|
||||
adr x15, jsimd_idct_islow_neon_consts
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v20.8b - v23.8b}, [sp], 32
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v28.8b - v31.8b}, [sp], 32
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
|
||||
mul v16.4h, v16.4h, v0.4h
|
||||
mul v17.4h, v17.4h, v1.4h
|
||||
ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
|
||||
ins v16.d[1], v17.d[0] /* 128 bit q8 */
|
||||
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
||||
mul v18.4h, v18.4h, v2.4h
|
||||
mul v19.4h, v19.4h, v3.4h
|
||||
ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
|
||||
ins v18.d[1], v19.d[0] /* 128 bit q9 */
|
||||
ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
|
||||
mul v20.4h, v20.4h, v4.4h
|
||||
mul v21.4h, v21.4h, v5.4h
|
||||
ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
|
||||
ins v20.d[1], v21.d[0] /* 128 bit q10 */
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||
mul v22.4h, v22.4h, v6.4h
|
||||
mul v23.4h, v23.4h, v7.4h
|
||||
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
|
||||
ins v22.d[1], v23.d[0] /* 128 bit q11 */
|
||||
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
|
||||
mul v24.4h, v24.4h, v0.4h
|
||||
mul v25.4h, v25.4h, v1.4h
|
||||
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
|
||||
ins v24.d[1], v25.d[0] /* 128 bit q12 */
|
||||
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
||||
mul v28.4h, v28.4h, v4.4h
|
||||
mul v29.4h, v29.4h, v5.4h
|
||||
ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
|
||||
ins v28.d[1], v29.d[0] /* 128 bit q14 */
|
||||
mul v26.4h, v26.4h, v2.4h
|
||||
mul v27.4h, v27.4h, v3.4h
|
||||
ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
|
||||
ins v26.d[1], v27.d[0] /* 128 bit q13 */
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
|
||||
add x15, x15, #16
|
||||
mul v30.4h, v30.4h, v6.4h
|
||||
mul v31.4h, v31.4h, v7.4h
|
||||
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
|
||||
ins v30.d[1], v31.d[0] /* 128 bit q15 */
|
||||
/* Go to the bottom of the stack */
|
||||
sub sp, sp, 352
|
||||
stp x4, x5, [sp], 16
|
||||
st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
|
||||
st1 {v12.4h - v15.4h}, [sp], 32
|
||||
st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32 /* save NEON registers */
|
||||
st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
|
||||
/* 1-D IDCT, pass 1, left 4x8 half */
|
||||
add v4.4h, ROW7L.4h, ROW3L.4h
|
||||
add v5.4h, ROW5L.4h, ROW1L.4h
|
||||
@@ -378,7 +379,7 @@ asm_function jsimd_idct_islow_neon
|
||||
rshrn ROW0L.4h, v12.4s, #11
|
||||
rshrn ROW4L.4h, v6.4s, #11
|
||||
|
||||
beq 3f /* Go to do some special handling for the sparse right 4x8 half */
|
||||
b.eq 3f /* Go to do some special handling for the sparse right 4x8 half */
|
||||
|
||||
/* 1-D IDCT, pass 1, right 4x8 half */
|
||||
ld1 {v2.4h}, [x15] /* reload constants */
|
||||
@@ -553,33 +554,33 @@ asm_function jsimd_idct_islow_neon
|
||||
shrn ROW4R.4h, v6.4s, #16
|
||||
|
||||
2: /* Descale to 8-bit and range limit */
|
||||
ins v16.2d[1], v17.2d[0]
|
||||
ins v18.2d[1], v19.2d[0]
|
||||
ins v20.2d[1], v21.2d[0]
|
||||
ins v22.2d[1], v23.2d[0]
|
||||
ins v16.d[1], v17.d[0]
|
||||
ins v18.d[1], v19.d[0]
|
||||
ins v20.d[1], v21.d[0]
|
||||
ins v22.d[1], v23.d[0]
|
||||
sqrshrn v16.8b, v16.8h, #2
|
||||
sqrshrn2 v16.16b, v18.8h, #2
|
||||
sqrshrn v18.8b, v20.8h, #2
|
||||
sqrshrn2 v18.16b, v22.8h, #2
|
||||
|
||||
/* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
|
||||
ld1 {v8.4h - v11.4h}, [sp], 32
|
||||
ld1 {v12.4h - v15.4h}, [sp], 32
|
||||
ins v24.2d[1], v25.2d[0]
|
||||
ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
|
||||
ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
|
||||
ins v24.d[1], v25.d[0]
|
||||
|
||||
sqrshrn v20.8b, v24.8h, #2
|
||||
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
|
||||
/* trn1 v16.8h, v16.8h, v18.8h */
|
||||
transpose v16, v18, v3, .16b, .8h
|
||||
ins v26.2d[1], v27.2d[0]
|
||||
ins v28.2d[1], v29.2d[0]
|
||||
ins v30.2d[1], v31.2d[0]
|
||||
ins v26.d[1], v27.d[0]
|
||||
ins v28.d[1], v29.d[0]
|
||||
ins v30.d[1], v31.d[0]
|
||||
sqrshrn2 v20.16b, v26.8h, #2
|
||||
sqrshrn v22.8b, v28.8h, #2
|
||||
movi v0.16b, #(CENTERJSAMPLE)
|
||||
sqrshrn2 v22.16b, v30.8h, #2
|
||||
transpose_single v16, v17, v3, .2d, .8b
|
||||
transpose_single v18, v19, v3, .2d, .8b
|
||||
transpose_single v16, v17, v3, .d, .8b
|
||||
transpose_single v18, v19, v3, .d, .8b
|
||||
add v16.8b, v16.8b, v0.8b
|
||||
add v17.8b, v17.8b, v0.8b
|
||||
add v18.8b, v18.8b, v0.8b
|
||||
@@ -590,7 +591,7 @@ asm_function jsimd_idct_islow_neon
|
||||
add TMP1, TMP1, OUTPUT_COL
|
||||
add TMP2, TMP2, OUTPUT_COL
|
||||
st1 {v16.8b}, [TMP1]
|
||||
transpose_single v20, v21, v3, .2d, .8b
|
||||
transpose_single v20, v21, v3, .d, .8b
|
||||
st1 {v17.8b}, [TMP2]
|
||||
ldp TMP1, TMP2, [OUTPUT_BUF], 16
|
||||
add TMP1, TMP1, OUTPUT_COL
|
||||
@@ -605,7 +606,7 @@ asm_function jsimd_idct_islow_neon
|
||||
add TMP2, TMP2, OUTPUT_COL
|
||||
add TMP3, TMP3, OUTPUT_COL
|
||||
add TMP4, TMP4, OUTPUT_COL
|
||||
transpose_single v22, v23, v3, .2d, .8b
|
||||
transpose_single v22, v23, v3, .d, .8b
|
||||
st1 {v20.8b}, [TMP1]
|
||||
add v22.8b, v22.8b, v0.8b
|
||||
add v23.8b, v23.8b, v0.8b
|
||||
@@ -613,14 +614,14 @@ asm_function jsimd_idct_islow_neon
|
||||
st1 {v22.8b}, [TMP3]
|
||||
st1 {v23.8b}, [TMP4]
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
|
||||
@@ -636,17 +637,17 @@ asm_function jsimd_idct_islow_neon
|
||||
transpose ROW0L, ROW2L, v3, .16b, .2s
|
||||
transpose ROW5L, ROW7L, v3, .16b, .2s
|
||||
cmp x0, #0
|
||||
beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
|
||||
b.eq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
|
||||
|
||||
/* Only row 0 is non-zero for the right 4x8 half */
|
||||
dup ROW1R.4h, ROW0R.4h[1]
|
||||
dup ROW2R.4h, ROW0R.4h[2]
|
||||
dup ROW3R.4h, ROW0R.4h[3]
|
||||
dup ROW4R.4h, ROW0R.4h[0]
|
||||
dup ROW5R.4h, ROW0R.4h[1]
|
||||
dup ROW6R.4h, ROW0R.4h[2]
|
||||
dup ROW7R.4h, ROW0R.4h[3]
|
||||
dup ROW0R.4h, ROW0R.4h[0]
|
||||
dup ROW1R.4h, ROW0R.h[1]
|
||||
dup ROW2R.4h, ROW0R.h[2]
|
||||
dup ROW3R.4h, ROW0R.h[3]
|
||||
dup ROW4R.4h, ROW0R.h[0]
|
||||
dup ROW5R.4h, ROW0R.h[1]
|
||||
dup ROW6R.4h, ROW0R.h[2]
|
||||
dup ROW7R.4h, ROW0R.h[3]
|
||||
dup ROW0R.4h, ROW0R.h[0]
|
||||
b 1b /* Go to 'normal' second pass */
|
||||
|
||||
4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
|
||||
@@ -770,13 +771,13 @@ asm_function jsimd_idct_islow_neon
|
||||
* per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
|
||||
*/
|
||||
|
||||
#define XFIX_1_082392200 v0.4h[0]
|
||||
#define XFIX_1_414213562 v0.4h[1]
|
||||
#define XFIX_1_847759065 v0.4h[2]
|
||||
#define XFIX_2_613125930 v0.4h[3]
|
||||
#define XFIX_1_082392200 v0.h[0]
|
||||
#define XFIX_1_414213562 v0.h[1]
|
||||
#define XFIX_1_847759065 v0.h[2]
|
||||
#define XFIX_2_613125930 v0.h[3]
|
||||
|
||||
.balign 16
|
||||
jsimd_idct_ifast_neon_consts:
|
||||
Ljsimd_idct_ifast_neon_consts:
|
||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||
@@ -810,12 +811,12 @@ asm_function jsimd_idct_ifast_neon
|
||||
/* Save NEON registers used in fast IDCT */
|
||||
sub sp, sp, #176
|
||||
stp x22, x23, [sp], 16
|
||||
adr x23, jsimd_idct_ifast_neon_consts
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
adr x23, Ljsimd_idct_ifast_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
|
||||
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
||||
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
|
||||
@@ -909,24 +910,24 @@ asm_function jsimd_idct_ifast_neon
|
||||
trn2 v15.4s, v18.4s, v15.4s
|
||||
/* vswp v14.4h, v10-MSB.4h */
|
||||
umov x22, v14.d[0]
|
||||
ins v14.2d[0], v10.2d[1]
|
||||
ins v10.2d[1], x22
|
||||
ins v14.d[0], v10.d[1]
|
||||
ins v10.d[1], x22
|
||||
/* vswp v13.4h, v9MSB.4h */
|
||||
|
||||
umov x22, v13.d[0]
|
||||
ins v13.2d[0], v9.2d[1]
|
||||
ins v9.2d[1], x22
|
||||
ins v13.d[0], v9.d[1]
|
||||
ins v9.d[1], x22
|
||||
/* 1-D IDCT, pass 2 */
|
||||
sub v2.8h, v10.8h, v14.8h
|
||||
/* vswp v15.4h, v11MSB.4h */
|
||||
umov x22, v15.d[0]
|
||||
ins v15.2d[0], v11.2d[1]
|
||||
ins v11.2d[1], x22
|
||||
ins v15.d[0], v11.d[1]
|
||||
ins v11.d[1], x22
|
||||
add v14.8h, v10.8h, v14.8h
|
||||
/* vswp v12.4h, v8-MSB.4h */
|
||||
umov x22, v12.d[0]
|
||||
ins v12.2d[0], v8.2d[1]
|
||||
ins v8.2d[1], x22
|
||||
ins v12.d[0], v8.d[1]
|
||||
ins v8.d[1], x22
|
||||
sub v1.8h, v11.8h, v13.8h
|
||||
add v13.8h, v11.8h, v13.8h
|
||||
sub v5.8h, v9.8h, v15.8h
|
||||
@@ -997,13 +998,13 @@ asm_function jsimd_idct_ifast_neon
|
||||
trn1 v9.4s, v9.4s, v11.4s
|
||||
trn2 v11.4s, v18.4s, v11.4s
|
||||
/* make copy */
|
||||
ins v17.2d[0], v8.2d[1]
|
||||
ins v17.d[0], v8.d[1]
|
||||
/* Transpose d16-d17-msb */
|
||||
mov v18.16b, v8.16b
|
||||
trn1 v8.8b, v8.8b, v17.8b
|
||||
trn2 v17.8b, v18.8b, v17.8b
|
||||
/* make copy */
|
||||
ins v19.2d[0], v9.2d[1]
|
||||
ins v19.d[0], v9.d[1]
|
||||
mov v18.16b, v9.16b
|
||||
trn1 v9.8b, v9.8b, v19.8b
|
||||
trn2 v19.8b, v18.8b, v19.8b
|
||||
@@ -1018,7 +1019,7 @@ asm_function jsimd_idct_ifast_neon
|
||||
add TMP2, TMP2, OUTPUT_COL
|
||||
st1 {v9.8b}, [TMP1]
|
||||
/* make copy */
|
||||
ins v7.2d[0], v10.2d[1]
|
||||
ins v7.d[0], v10.d[1]
|
||||
mov v18.16b, v10.16b
|
||||
trn1 v10.8b, v10.8b, v7.8b
|
||||
trn2 v7.8b, v18.8b, v7.8b
|
||||
@@ -1031,7 +1032,7 @@ asm_function jsimd_idct_ifast_neon
|
||||
add TMP5, TMP5, OUTPUT_COL
|
||||
st1 {v10.8b}, [TMP1]
|
||||
/* make copy */
|
||||
ins v16.2d[0], v11.2d[1]
|
||||
ins v16.d[0], v11.d[1]
|
||||
mov v18.16b, v11.16b
|
||||
trn1 v11.8b, v11.8b, v16.8b
|
||||
trn2 v16.8b, v18.8b, v16.8b
|
||||
@@ -1040,11 +1041,11 @@ asm_function jsimd_idct_ifast_neon
|
||||
st1 {v16.8b}, [TMP5]
|
||||
sub sp, sp, #176
|
||||
ldp x22, x23, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1095,38 +1096,38 @@ asm_function jsimd_idct_ifast_neon
|
||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||
|
||||
.balign 16
|
||||
jsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.4h[0] */
|
||||
.short -FIX_0_765366865 /* v0.4h[1] */
|
||||
.short -FIX_0_211164243 /* v0.4h[2] */
|
||||
.short FIX_1_451774981 /* v0.4h[3] */
|
||||
Ljsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.h[0] */
|
||||
.short -FIX_0_765366865 /* v0.h[1] */
|
||||
.short -FIX_0_211164243 /* v0.h[2] */
|
||||
.short FIX_1_451774981 /* v0.h[3] */
|
||||
.short -FIX_2_172734803 /* d1[0] */
|
||||
.short FIX_1_061594337 /* d1[1] */
|
||||
.short -FIX_0_509795579 /* d1[2] */
|
||||
.short -FIX_0_601344887 /* d1[3] */
|
||||
.short FIX_0_899976223 /* v2.4h[0] */
|
||||
.short FIX_2_562915447 /* v2.4h[1] */
|
||||
.short 1 << (CONST_BITS+1) /* v2.4h[2] */
|
||||
.short 0 /* v2.4h[3] */
|
||||
.short FIX_0_899976223 /* v2.h[0] */
|
||||
.short FIX_2_562915447 /* v2.h[1] */
|
||||
.short 1 << (CONST_BITS+1) /* v2.h[2] */
|
||||
.short 0 /* v2.h[3] */
|
||||
|
||||
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
|
||||
smull v28.4s, \x4, v2.4h[2]
|
||||
smlal v28.4s, \x8, v0.4h[0]
|
||||
smlal v28.4s, \x14, v0.4h[1]
|
||||
smull v28.4s, \x4, v2.h[2]
|
||||
smlal v28.4s, \x8, v0.h[0]
|
||||
smlal v28.4s, \x14, v0.h[1]
|
||||
|
||||
smull v26.4s, \x16, v1.4h[2]
|
||||
smlal v26.4s, \x12, v1.4h[3]
|
||||
smlal v26.4s, \x10, v2.4h[0]
|
||||
smlal v26.4s, \x6, v2.4h[1]
|
||||
smull v26.4s, \x16, v1.h[2]
|
||||
smlal v26.4s, \x12, v1.h[3]
|
||||
smlal v26.4s, \x10, v2.h[0]
|
||||
smlal v26.4s, \x6, v2.h[1]
|
||||
|
||||
smull v30.4s, \x4, v2.4h[2]
|
||||
smlsl v30.4s, \x8, v0.4h[0]
|
||||
smlsl v30.4s, \x14, v0.4h[1]
|
||||
smull v30.4s, \x4, v2.h[2]
|
||||
smlsl v30.4s, \x8, v0.h[0]
|
||||
smlsl v30.4s, \x14, v0.h[1]
|
||||
|
||||
smull v24.4s, \x16, v0.4h[2]
|
||||
smlal v24.4s, \x12, v0.4h[3]
|
||||
smlal v24.4s, \x10, v1.4h[0]
|
||||
smlal v24.4s, \x6, v1.4h[1]
|
||||
smull v24.4s, \x16, v0.h[2]
|
||||
smlal v24.4s, \x12, v0.h[3]
|
||||
smlal v24.4s, \x10, v1.h[0]
|
||||
smlal v24.4s, \x6, v1.h[1]
|
||||
|
||||
add v20.4s, v28.4s, v26.4s
|
||||
sub v28.4s, v28.4s, v26.4s
|
||||
@@ -1171,15 +1172,15 @@ asm_function jsimd_idct_4x4_neon
|
||||
sub sp, sp, 272
|
||||
str x15, [sp], 16
|
||||
/* Load constants (v3.4h is just used for padding) */
|
||||
adr TMP4, jsimd_idct_4x4_neon_consts
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v20.8b - v23.8b}, [sp], 32
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v28.8b - v31.8b}, [sp], 32
|
||||
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1203,45 +1204,45 @@ asm_function jsimd_idct_4x4_neon
|
||||
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
||||
mul v4.4h, v4.4h, v18.4h
|
||||
mul v5.4h, v5.4h, v19.4h
|
||||
ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
|
||||
ins v4.d[1], v5.d[0] /* 128 bit q4 */
|
||||
ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
|
||||
mul v6.4h, v6.4h, v20.4h
|
||||
mul v7.4h, v7.4h, v21.4h
|
||||
ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
|
||||
ins v6.d[1], v7.d[0] /* 128 bit q6 */
|
||||
mul v8.4h, v8.4h, v22.4h
|
||||
mul v9.4h, v9.4h, v23.4h
|
||||
ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
|
||||
ins v8.d[1], v9.d[0] /* 128 bit q8 */
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
|
||||
mul v10.4h, v10.4h, v24.4h
|
||||
mul v11.4h, v11.4h, v25.4h
|
||||
ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
|
||||
ins v10.d[1], v11.d[0] /* 128 bit q10 */
|
||||
mul v12.4h, v12.4h, v26.4h
|
||||
mul v13.4h, v13.4h, v27.4h
|
||||
ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
|
||||
ins v12.d[1], v13.d[0] /* 128 bit q12 */
|
||||
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
||||
mul v14.4h, v14.4h, v28.4h
|
||||
mul v15.4h, v15.4h, v29.4h
|
||||
ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
|
||||
ins v14.d[1], v15.d[0] /* 128 bit q14 */
|
||||
mul v16.4h, v16.4h, v30.4h
|
||||
mul v17.4h, v17.4h, v31.4h
|
||||
ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
|
||||
ins v16.d[1], v17.d[0] /* 128 bit q16 */
|
||||
|
||||
/* Pass 1 */
|
||||
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
|
||||
transpose_4x4 v4, v6, v8, v10, v3
|
||||
ins v10.2d[1], v11.2d[0]
|
||||
ins v10.d[1], v11.d[0]
|
||||
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
|
||||
transpose_4x4 v5, v7, v9, v11, v3
|
||||
ins v10.2d[1], v11.2d[0]
|
||||
ins v10.d[1], v11.d[0]
|
||||
/* Pass 2 */
|
||||
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
|
||||
transpose_4x4 v26, v27, v28, v29, v3
|
||||
|
||||
/* Range limit */
|
||||
movi v30.8h, #0x80
|
||||
ins v26.2d[1], v27.2d[0]
|
||||
ins v28.2d[1], v29.2d[0]
|
||||
ins v26.d[1], v27.d[0]
|
||||
ins v28.d[1], v29.d[0]
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
add v28.8h, v28.8h, v30.8h
|
||||
sqxtun v26.8b, v26.8h
|
||||
@@ -1286,14 +1287,14 @@ asm_function jsimd_idct_4x4_neon
|
||||
/* vpop {v8.4h - v15.4h} ;not available */
|
||||
sub sp, sp, #272
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1325,7 +1326,7 @@ asm_function jsimd_idct_4x4_neon
|
||||
*/
|
||||
|
||||
.balign 8
|
||||
jsimd_idct_2x2_neon_consts:
|
||||
Ljsimd_idct_2x2_neon_consts:
|
||||
.short -FIX_0_720959822 /* v14[0] */
|
||||
.short FIX_0_850430095 /* v14[1] */
|
||||
.short -FIX_1_272758580 /* v14[2] */
|
||||
@@ -1333,10 +1334,10 @@ jsimd_idct_2x2_neon_consts:
|
||||
|
||||
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
||||
sshll v15.4s, \x4, #15
|
||||
smull v26.4s, \x6, v14.4h[3]
|
||||
smlal v26.4s, \x10, v14.4h[2]
|
||||
smlal v26.4s, \x12, v14.4h[1]
|
||||
smlal v26.4s, \x16, v14.4h[0]
|
||||
smull v26.4s, \x6, v14.h[3]
|
||||
smlal v26.4s, \x10, v14.h[2]
|
||||
smlal v26.4s, \x12, v14.h[1]
|
||||
smlal v26.4s, \x16, v14.h[0]
|
||||
|
||||
add v20.4s, v15.4s, v26.4s
|
||||
sub v15.4s, v15.4s, v26.4s
|
||||
@@ -1367,14 +1368,14 @@ asm_function jsimd_idct_2x2_neon
|
||||
str x15, [sp], 16
|
||||
|
||||
/* Load constants */
|
||||
adr TMP2, jsimd_idct_2x2_neon_consts
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v21.8b - v22.8b}, [sp], 16
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v30.8b - v31.8b}, [sp], 16
|
||||
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v21.8b, v22.8b}, [sp], 16
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v30.8b, v31.8b}, [sp], 16
|
||||
ld1 {v14.4h}, [TMP2]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1400,25 +1401,25 @@ asm_function jsimd_idct_2x2_neon
|
||||
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
||||
mul v4.4h, v4.4h, v18.4h
|
||||
mul v5.4h, v5.4h, v19.4h
|
||||
ins v4.2d[1], v5.2d[0]
|
||||
ins v4.d[1], v5.d[0]
|
||||
mul v6.4h, v6.4h, v20.4h
|
||||
mul v7.4h, v7.4h, v21.4h
|
||||
ins v6.2d[1], v7.2d[0]
|
||||
ins v6.d[1], v7.d[0]
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
|
||||
mul v10.4h, v10.4h, v24.4h
|
||||
mul v11.4h, v11.4h, v25.4h
|
||||
ins v10.2d[1], v11.2d[0]
|
||||
ins v10.d[1], v11.d[0]
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
|
||||
mul v12.4h, v12.4h, v26.4h
|
||||
mul v13.4h, v13.4h, v27.4h
|
||||
ins v12.2d[1], v13.2d[0]
|
||||
ins v12.d[1], v13.d[0]
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
||||
mul v16.4h, v16.4h, v30.4h
|
||||
mul v17.4h, v17.4h, v31.4h
|
||||
ins v16.2d[1], v17.2d[0]
|
||||
ins v16.d[1], v17.d[0]
|
||||
|
||||
/* Pass 1 */
|
||||
#if 0
|
||||
@@ -1427,14 +1428,14 @@ asm_function jsimd_idct_2x2_neon
|
||||
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
|
||||
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
|
||||
#else
|
||||
smull v26.4s, v6.4h, v14.4h[3]
|
||||
smlal v26.4s, v10.4h, v14.4h[2]
|
||||
smlal v26.4s, v12.4h, v14.4h[1]
|
||||
smlal v26.4s, v16.4h, v14.4h[0]
|
||||
smull v24.4s, v7.4h, v14.4h[3]
|
||||
smlal v24.4s, v11.4h, v14.4h[2]
|
||||
smlal v24.4s, v13.4h, v14.4h[1]
|
||||
smlal v24.4s, v17.4h, v14.4h[0]
|
||||
smull v26.4s, v6.4h, v14.h[3]
|
||||
smlal v26.4s, v10.4h, v14.h[2]
|
||||
smlal v26.4s, v12.4h, v14.h[1]
|
||||
smlal v26.4s, v16.4h, v14.h[0]
|
||||
smull v24.4s, v7.4h, v14.h[3]
|
||||
smlal v24.4s, v11.4h, v14.h[2]
|
||||
smlal v24.4s, v13.4h, v14.h[1]
|
||||
smlal v24.4s, v17.4h, v14.h[0]
|
||||
sshll v15.4s, v4.4h, #15
|
||||
sshll v30.4s, v5.4h, #15
|
||||
add v20.4s, v15.4s, v26.4s
|
||||
@@ -1445,12 +1446,12 @@ asm_function jsimd_idct_2x2_neon
|
||||
sub v15.4s, v30.4s, v24.4s
|
||||
rshrn v5.4h, v20.4s, #13
|
||||
rshrn v7.4h, v15.4s, #13
|
||||
ins v4.2d[1], v5.2d[0]
|
||||
ins v6.2d[1], v7.2d[0]
|
||||
ins v4.d[1], v5.d[0]
|
||||
ins v6.d[1], v7.d[0]
|
||||
transpose v4, v6, v3, .16b, .8h
|
||||
transpose v6, v10, v3, .16b, .4s
|
||||
ins v11.2d[0], v10.2d[1]
|
||||
ins v7.2d[0], v6.2d[1]
|
||||
ins v11.d[0], v10.d[1]
|
||||
ins v7.d[0], v6.d[1]
|
||||
#endif
|
||||
|
||||
/* Pass 2 */
|
||||
@@ -1458,10 +1459,10 @@ asm_function jsimd_idct_2x2_neon
|
||||
|
||||
/* Range limit */
|
||||
movi v30.8h, #0x80
|
||||
ins v26.2d[1], v27.2d[0]
|
||||
ins v26.d[1], v27.d[0]
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
sqxtun v30.8b, v26.8h
|
||||
ins v26.2d[0], v30.2d[0]
|
||||
ins v26.d[0], v30.d[0]
|
||||
sqxtun v27.8b, v26.8h
|
||||
|
||||
/* Store results to the output buffer */
|
||||
@@ -1476,13 +1477,13 @@ asm_function jsimd_idct_2x2_neon
|
||||
|
||||
sub sp, sp, #208
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v21.8b - v22.8b}, [sp], 16
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v30.8b - v31.8b}, [sp], 16
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v21.8b, v22.8b}, [sp], 16
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v30.8b, v31.8b}, [sp], 16
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1514,9 +1515,9 @@ asm_function jsimd_idct_2x2_neon
|
||||
ld1 {v4.8b}, [U], 8
|
||||
ld1 {v5.8b}, [V], 8
|
||||
ld1 {v0.8b}, [Y], 8
|
||||
prfm PLDL1KEEP, [U, #64]
|
||||
prfm PLDL1KEEP, [V, #64]
|
||||
prfm PLDL1KEEP, [Y, #64]
|
||||
prfm pldl1keep, [U, #64]
|
||||
prfm pldl1keep, [V, #64]
|
||||
prfm pldl1keep, [Y, #64]
|
||||
.elseif \size == 4
|
||||
ld1 {v4.b}[0], [U], 1
|
||||
ld1 {v4.b}[1], [U], 1
|
||||
@@ -1606,14 +1607,14 @@ asm_function jsimd_idct_2x2_neon
|
||||
.macro do_yuv_to_rgb_stage1
|
||||
uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
|
||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
||||
smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
|
||||
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||
smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb_stage2
|
||||
@@ -1656,18 +1657,18 @@ asm_function jsimd_idct_2x2_neon
|
||||
sqxtun v1\g_offs\defsize, v20.8h
|
||||
ld1 {v0.8b}, [Y], 8
|
||||
sqxtun v1\r_offs\defsize, v24.8h
|
||||
prfm PLDL1KEEP, [U, #64]
|
||||
prfm PLDL1KEEP, [V, #64]
|
||||
prfm PLDL1KEEP, [Y, #64]
|
||||
prfm pldl1keep, [U, #64]
|
||||
prfm pldl1keep, [V, #64]
|
||||
prfm pldl1keep, [Y, #64]
|
||||
sqxtun v1\b_offs\defsize, v28.8h
|
||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
||||
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||
.else /**************************** rgb565 ***********************************/
|
||||
sqshlu v21.8h, v20.8h, #8
|
||||
sqshlu v25.8h, v24.8h, #8
|
||||
@@ -1675,21 +1676,21 @@ asm_function jsimd_idct_2x2_neon
|
||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||
ld1 {v0.8b}, [Y], 8
|
||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
||||
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||
sri v25.8h, v21.8h, #5
|
||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
||||
prfm PLDL1KEEP, [U, #64]
|
||||
prfm PLDL1KEEP, [V, #64]
|
||||
prfm PLDL1KEEP, [Y, #64]
|
||||
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||
prfm pldl1keep, [U, #64]
|
||||
prfm pldl1keep, [V, #64]
|
||||
prfm pldl1keep, [Y, #64]
|
||||
sri v25.8h, v29.8h, #11
|
||||
.endif
|
||||
do_store \bpp, 8
|
||||
smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
|
||||
smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb
|
||||
@@ -1702,7 +1703,7 @@ asm_function jsimd_idct_2x2_neon
|
||||
*/
|
||||
|
||||
.balign 16
|
||||
jsimd_ycc_\colorid\()_neon_consts:
|
||||
Ljsimd_ycc_\colorid\()_neon_consts:
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
@@ -1717,7 +1718,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
INPUT_BUF0 .req x5
|
||||
INPUT_BUF1 .req x6
|
||||
INPUT_BUF2 .req INPUT_BUF
|
||||
INPUT_BUF2 .req x1
|
||||
|
||||
RGB .req x7
|
||||
Y .req x8
|
||||
@@ -1728,16 +1729,16 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
sub sp, sp, 336
|
||||
str x15, [sp], 16
|
||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||
adr x15, jsimd_ycc_\colorid\()_neon_consts
|
||||
adr x15, Ljsimd_ycc_\colorid\()_neon_consts
|
||||
/* Save NEON registers */
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v20.8b - v23.8b}, [sp], 32
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v28.8b - v31.8b}, [sp], 32
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
ld1 {v0.4h, v1.4h}, [x15], 16
|
||||
ld1 {v2.8h}, [x15]
|
||||
|
||||
@@ -1748,8 +1749,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
stp x8, x9, [sp], 16
|
||||
stp x10, x30, [sp], 16
|
||||
ldr INPUT_BUF0, [INPUT_BUF]
|
||||
ldr INPUT_BUF1, [INPUT_BUF, 8]
|
||||
ldr INPUT_BUF2, [INPUT_BUF, 16]
|
||||
ldr INPUT_BUF1, [INPUT_BUF, #8]
|
||||
ldr INPUT_BUF2, [INPUT_BUF, #16]
|
||||
.unreq INPUT_BUF
|
||||
|
||||
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
|
||||
@@ -1758,7 +1759,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
/* Outer loop over scanlines */
|
||||
cmp NUM_ROWS, #1
|
||||
blt 9f
|
||||
b.lt 9f
|
||||
0:
|
||||
lsl x16, INPUT_ROW, #3
|
||||
ldr Y, [INPUT_BUF0, x16]
|
||||
@@ -1770,60 +1771,60 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
/* Inner loop over pixels */
|
||||
subs N, N, #8
|
||||
blt 3f
|
||||
b.lt 3f
|
||||
do_load 8
|
||||
do_yuv_to_rgb_stage1
|
||||
subs N, N, #8
|
||||
blt 2f
|
||||
b.lt 2f
|
||||
1:
|
||||
do_yuv_to_rgb_stage2_store_load_stage1
|
||||
subs N, N, #8
|
||||
bge 1b
|
||||
b.ge 1b
|
||||
2:
|
||||
do_yuv_to_rgb_stage2
|
||||
do_store \bpp, 8
|
||||
tst N, #7
|
||||
beq 8f
|
||||
b.eq 8f
|
||||
3:
|
||||
tst N, #4
|
||||
beq 3f
|
||||
b.eq 3f
|
||||
do_load 4
|
||||
3:
|
||||
tst N, #2
|
||||
beq 4f
|
||||
b.eq 4f
|
||||
do_load 2
|
||||
4:
|
||||
tst N, #1
|
||||
beq 5f
|
||||
b.eq 5f
|
||||
do_load 1
|
||||
5:
|
||||
do_yuv_to_rgb
|
||||
tst N, #4
|
||||
beq 6f
|
||||
b.eq 6f
|
||||
do_store \bpp, 4
|
||||
6:
|
||||
tst N, #2
|
||||
beq 7f
|
||||
b.eq 7f
|
||||
do_store \bpp, 2
|
||||
7:
|
||||
tst N, #1
|
||||
beq 8f
|
||||
b.eq 8f
|
||||
do_store \bpp, 1
|
||||
8:
|
||||
subs NUM_ROWS, NUM_ROWS, #1
|
||||
bgt 0b
|
||||
b.gt 0b
|
||||
9:
|
||||
/* Restore all registers and return */
|
||||
sub sp, sp, #336
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
||||
ldp x4, x5, [sp], 16
|
||||
ldp x6, x7, [sp], 16
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* jsimd_powerpc64.c
|
||||
* jsimd_powerpc.c
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright 2009-2011, 2014 D. R. Commander
|
||||
@@ -42,12 +42,38 @@ init_simd (void)
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_ycc (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_gray (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -68,6 +94,37 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
|
||||
switch(cinfo->in_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
altivecfct=jsimd_extrgb_ycc_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
altivecfct=jsimd_extrgbx_ycc_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
altivecfct=jsimd_extbgr_ycc_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
altivecfct=jsimd_extbgrx_ycc_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
altivecfct=jsimd_extxbgr_ycc_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
altivecfct=jsimd_extxrgb_ycc_convert_altivec;
|
||||
break;
|
||||
default:
|
||||
altivecfct=jsimd_rgb_ycc_convert_altivec;
|
||||
break;
|
||||
}
|
||||
|
||||
altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -75,6 +132,37 @@ jsimd_rgb_gray_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
|
||||
switch(cinfo->in_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
altivecfct=jsimd_extrgb_gray_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
altivecfct=jsimd_extrgbx_gray_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
altivecfct=jsimd_extbgr_gray_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
altivecfct=jsimd_extbgrx_gray_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
altivecfct=jsimd_extxbgr_gray_convert_altivec;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
altivecfct=jsimd_extxrgb_gray_convert_altivec;
|
||||
break;
|
||||
default:
|
||||
altivecfct=jsimd_rgb_gray_convert_altivec;
|
||||
break;
|
||||
}
|
||||
|
||||
altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -202,6 +290,21 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
|
||||
GLOBAL(int)
|
||||
jsimd_can_convsamp (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -215,6 +318,7 @@ GLOBAL(void)
|
||||
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
jsimd_convsamp_altivec(sample_data, start_col, workspace);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -226,6 +330,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -255,6 +370,7 @@ jsimd_can_fdct_float (void)
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_islow (DCTELEM * data)
|
||||
{
|
||||
jsimd_fdct_islow_altivec(data);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -271,6 +387,19 @@ jsimd_fdct_float (FAST_FLOAT * data)
|
||||
GLOBAL(int)
|
||||
jsimd_can_quantize (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -284,6 +413,7 @@ GLOBAL(void)
|
||||
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
jsimd_quantize_altivec(coef_block, divisors, workspace);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -321,12 +451,34 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_ifast (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ALTIVEC)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -341,6 +493,8 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
|
||||
output_col);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -348,6 +502,8 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
|
||||
output_col);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
||||
@@ -50,4 +50,7 @@ TURBOJPEG_1.4
|
||||
tjDecompressToYUVPlanes;
|
||||
tjEncodeYUV3;
|
||||
tjEncodeYUVPlanes;
|
||||
tjPlaneHeight;
|
||||
tjPlaneSizeYUV;
|
||||
tjPlaneWidth;
|
||||
} TURBOJPEG_1.2;
|
||||
|
||||
@@ -76,6 +76,9 @@ TURBOJPEG_1.4
|
||||
tjDecompressToYUVPlanes;
|
||||
tjEncodeYUV3;
|
||||
tjEncodeYUVPlanes;
|
||||
tjPlaneHeight;
|
||||
tjPlaneSizeYUV;
|
||||
tjPlaneWidth;
|
||||
Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
|
||||
Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
|
||||
Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;
|
||||
|
||||
@@ -320,6 +320,14 @@ static int setDecompDefaults(struct jpeg_decompress_struct *dinfo,
|
||||
static int getSubsamp(j_decompress_ptr dinfo)
|
||||
{
|
||||
int retval=-1, i, k;
|
||||
|
||||
/* The sampling factors actually have no meaning with grayscale JPEG files,
|
||||
and in fact it's possible to generate grayscale JPEGs with sampling
|
||||
factors > 1 (even though those sampling factors are ignored by the
|
||||
decompressor.) Thus, we need to treat grayscale as a special case. */
|
||||
if(dinfo->num_components==1 && dinfo->jpeg_color_space==JCS_GRAYSCALE)
|
||||
return TJSAMP_GRAY;
|
||||
|
||||
for(i=0; i<NUMSUBOPT; i++)
|
||||
{
|
||||
if(dinfo->num_components==pixelsize[i]
|
||||
|
||||
Reference in New Issue
Block a user