Merge branch 'libjpeg-turbo'
* libjpeg-turbo: (39 commits) Oops. Delete the duplicate copy of [lib]turbojpeg.dll in the binary directory when uninstalling the package. AltiVec SIMD implementation of sample conversion and integer quantization Document the fact that the AltiVec implementation uses the same modified algorithms as the SSE2 implementation Use intrinsics for loading/storing data in the DCT/IDCT functions. This has no effect on the performance of the aligned loads/stores, but it makes it more obvious what that code is doing. Using intrinsics for the unaligned stores in the inverse DCT functions increases overall decompression performance by 1-2%. AltiVec SIMD implementation of RGB-to-Grayscale color conversion Remove unneeded code; Make sure jccolor-altivec.o will be rebuilt if jccolext-altivec.c changes. AltiVec SIMD implementation of RGB-to-YCC color conversion Make test a phony target so things don't go haywire if there is a file named test.c in the current directory. Maintain the traditional order of the regression tests while allowing the TurboJPEG and libjpeg portions to be executed separately Make comments more consistent Add a "quicktest" pseudo-target, for those times when you just don't want to sit through 11 iterations of TJUnitTest. Cosmetic tweaks to the PowerPC SIMD stubs Split AltiVec algorithms into separate files for ease of maintenance; Rename constants using lowercase so they are not confused with macros Optimizations to the AltiVec DCT algorithms (pre-compute constants and combine multiply/add operations) AltiVec SIMD implementation of slow integer inverse DCT Use macros to allocate constants statically, rather than reading them from a table using vec_splat*(). This improves code readability and probably improves performance a bit as well. Swap the order of the IFAST and ISLOW FDCT functions so that it matches the order of the prototypes in jsimd.h and the stubs in jsimd_powerpc.c. 
Include ARMv8 binaries when generating a combined OS X/iOS package using 'make iosdmg' In the output of the configure script, indicate whether gas-preprocessor.pl is being used along with the assembler. Modify the ARM64 assembly file so that it uses only syntax that the clang assembler in XCode 5.x can understand. These changes should all be cosmetic in nature-- they do not change the meaning or readability of the code nor the ability to build it for Linux. Actually, the code is now more in compliance with the ARM64 programming manual. In addition to these changes, there were a couple of instructions that clang simply doesn't support, so gas-preprocessor.pl was modified so that it now converts those into equivalent instructions that clang can handle. ... Conflicts: BUILDING.txt ChangeLog.txt cjpeg.c jpegtran.c
This commit is contained in:
104
BUILDING.txt
104
BUILDING.txt
@@ -322,6 +322,9 @@ Additional build requirements:
|
|||||||
(https://sourceforge.net/p/libjpeg-turbo/code/HEAD/tree/gas-preprocessor)
|
(https://sourceforge.net/p/libjpeg-turbo/code/HEAD/tree/gas-preprocessor)
|
||||||
should be installed in your PATH.
|
should be installed in your PATH.
|
||||||
|
|
||||||
|
|
||||||
|
ARM 32-bit Build (Xcode 4.6.x and earlier, LLVM-GCC):
|
||||||
|
|
||||||
Set the following shell variables for simplicity:
|
Set the following shell variables for simplicity:
|
||||||
|
|
||||||
Xcode 4.2 and earlier:
|
Xcode 4.2 and earlier:
|
||||||
@@ -330,47 +333,80 @@ Set the following shell variables for simplicity:
|
|||||||
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
||||||
|
|
||||||
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||||
|
|
||||||
Xcode 4.6.x and earlier:
|
|
||||||
IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
|
IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
|
||||||
Xcode 5.0.x and later:
|
|
||||||
IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
|
||||||
|
|
||||||
ARM v6 (code will run on all iOS devices, not SIMD-accelerated):
|
ARMv6 (code will run on all iOS devices, not SIMD-accelerated):
|
||||||
[NOTE: Requires Xcode 4.4.x or earlier]
|
[NOTE: Requires Xcode 4.4.x or earlier]
|
||||||
IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
|
IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
|
||||||
|
|
||||||
ARM v7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
|
ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
|
||||||
Xcode 4.6.x and earlier:
|
|
||||||
IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
|
IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
|
||||||
Xcode 5.0.x and later:
|
|
||||||
IOS_CFLAGS="-arch armv7"
|
|
||||||
|
|
||||||
ARM v7s (code will run on iPhone 5/iPad 4th Generation and newer):
|
ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
|
||||||
[NOTE: Requires Xcode 4.5 or later]
|
[NOTE: Requires Xcode 4.5 or later]
|
||||||
Xcode 4.6.x and earlier:
|
|
||||||
IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
|
IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
|
||||||
Xcode 5.0.x and later:
|
|
||||||
IOS_CFLAGS="-arch armv7s"
|
|
||||||
|
|
||||||
Follow the procedure under "Building mozjpeg" above, adding
|
Follow the procedure under "Building mozjpeg" above, adding
|
||||||
|
|
||||||
--host arm-apple-darwin10 --enable-static --disable-shared \
|
--host arm-apple-darwin10 \
|
||||||
CC="$IOS_GCC" LD="$IOS_GCC" \
|
CC="$IOS_GCC" LD="$IOS_GCC" \
|
||||||
CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
||||||
LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
|
LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
|
||||||
|
|
||||||
to the configure command line. If using Xcode 5.0.x or later, also add
|
to the configure command line.
|
||||||
|
|
||||||
|
|
||||||
|
ARM 32-bit Build (Xcode 5.0.x and later, Clang):
|
||||||
|
|
||||||
|
Set the following shell variables for simplicity:
|
||||||
|
|
||||||
|
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
||||||
|
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||||
|
IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
|
|
||||||
|
ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
|
||||||
|
IOS_CFLAGS="-arch armv7"
|
||||||
|
|
||||||
|
ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
|
||||||
|
IOS_CFLAGS="-arch armv7s"
|
||||||
|
|
||||||
|
Follow the procedure under "Building mozjpeg" above, adding
|
||||||
|
|
||||||
|
--host arm-apple-darwin10 \
|
||||||
|
CC="$IOS_GCC" LD="$IOS_GCC" \
|
||||||
|
CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
||||||
|
LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS" \
|
||||||
CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
|
CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
|
||||||
|
|
||||||
to the configure command line.
|
to the configure command line.
|
||||||
|
|
||||||
|
|
||||||
|
ARMv8 64-bit Build (Xcode 5.0.x and later, Clang):
|
||||||
|
|
||||||
|
Code will run on iPhone 5S/iPad Mini 2 and newer.
|
||||||
|
|
||||||
|
Set the following shell variables for simplicity:
|
||||||
|
|
||||||
|
IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
|
||||||
|
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||||
|
IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
|
IOS_CFLAGS="-arch arm64"
|
||||||
|
|
||||||
|
Follow the procedure under "Building mozjpeg" above, adding
|
||||||
|
|
||||||
|
--host aarch64-apple-darwin \
|
||||||
|
CC="$IOS_GCC" LD="$IOS_GCC" \
|
||||||
|
CFLAGS="-isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
|
||||||
|
LDFLAGS="-isysroot $IOS_SYSROOT $IOS_CFLAGS"
|
||||||
|
|
||||||
|
to the configure command line.
|
||||||
|
|
||||||
|
|
||||||
NOTE: You can also add -miphoneos-version-min={version} to $IOS_CFLAGS above
|
NOTE: You can also add -miphoneos-version-min={version} to $IOS_CFLAGS above
|
||||||
in order to support older versions of iOS than the default version supported by
|
in order to support older versions of iOS than the default version supported by
|
||||||
the SDK.
|
the SDK.
|
||||||
|
|
||||||
Once built, lipo can be used to combine the ARM v6, v7, and/or v7s variants
|
Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
|
||||||
into a universal library.
|
into a universal library.
|
||||||
|
|
||||||
|
|
||||||
@@ -745,26 +781,32 @@ make udmg [BUILDDIR32={32-bit build directory}]
|
|||||||
make command line as shown above.
|
make command line as shown above.
|
||||||
|
|
||||||
make iosdmg [BUILDDIR32={32-bit build directory}] \
|
make iosdmg [BUILDDIR32={32-bit build directory}] \
|
||||||
[BUILDDIRARMV6={ARM v6 build directory}] \
|
[BUILDDIRARMV6={ARMv6 build directory}] \
|
||||||
[BUILDDIRARMV7={ARM v7 build directory}] \
|
[BUILDDIRARMV7={ARMv7 build directory}] \
|
||||||
[BUILDDIRARMV7S={ARM v7s build directory}]
|
[BUILDDIRARMV7S={ARMv7s build directory}] \
|
||||||
|
[BUILDDIRARMV8={ARMv8 build directory}]
|
||||||
|
|
||||||
On OS X systems, this creates a Macintosh package and disk image in which the
|
On OS X systems, this creates a Macintosh package and disk image in which the
|
||||||
mozjpeg static libraries contain ARM architectures necessary to build
|
mozjpeg static libraries contain ARM architectures necessary to build
|
||||||
iOS applications. If building on an x86-64 system, the binaries will also
|
iOS applications. If building on an x86-64 system, the binaries will also
|
||||||
contain the i386 architecture, as with 'make udmg' above. You should first
|
contain the i386 architecture, as with 'make udmg' above. You should first
|
||||||
configure ARM v6, ARM v7, and/or ARM v7s out-of-tree builds of mozjpeg
|
configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
|
||||||
(see "Building mozjpeg for iOS" above.) If you are building an x86-64
|
mozjpeg (see "Building mozjpeg for iOS" above.) If you are
|
||||||
version of mozjpeg, you should configure a 32-bit out-of-tree build as
|
building an x86-64 version of mozjpeg, you should configure a 32-bit
|
||||||
well. Next, build mozjpeg as you would normally, using an out-of-tree
|
out-of-tree build as well. Next, build mozjpeg as you would normally,
|
||||||
build. When it is built, run 'make iosdmg' from the build directory. The
|
using an out-of-tree build. When it is built, run 'make iosdmg' from the
|
||||||
build system will look for the ARM v6 build under {source_directory}/iosarmv6
|
build directory. The build system will look for the ARMv6 build under
|
||||||
by default, the ARM v7 build under {source_directory}/iosarmv7 by default,
|
{source_directory}/iosarmv6 by default, the ARMv7 build under
|
||||||
the ARM v7s build under {source_directory}/iosarmv7s by default, and (if
|
{source_directory}/iosarmv7 by default, the ARMv7s build under
|
||||||
applicable) the 32-bit build under {source_directory}/osxx86 by default, but
|
{source_directory}/iosarmv7s by default, the ARMv8 build under
|
||||||
you can override this by setting the BUILDDIR32, BUILDDIRARMV6,
|
{source_directory}/iosarmv8 by default, and (if applicable) the 32-bit build
|
||||||
BUILDDIRARMV7, and/or BUILDDIRARMV7S variables on the make command line as
|
under {source_directory}/osxx86 by default, but you can override this by
|
||||||
shown above.
|
setting the BUILDDIR32, BUILDDIRARMV6, BUILDDIRARMV7, BUILDDIRARMV7S, and/or
|
||||||
|
BUILDDIRARMV8 variables on the make command line as shown above.
|
||||||
|
|
||||||
|
NOTE: If including an ARMv8 build in the package, then you may need to use
|
||||||
|
Xcode's version of lipo instead of the operating system's. To do this, pass
|
||||||
|
an argument of LIPO="xcrun lipo" on the make command line.
|
||||||
|
|
||||||
make cygwinpkg
|
make cygwinpkg
|
||||||
|
|
||||||
|
|||||||
@@ -557,7 +557,7 @@ foreach(libtype shared static)
|
|||||||
add_test(djpeg${suffix}-gray-islow-rgb
|
add_test(djpeg${suffix}-gray-islow-rgb
|
||||||
${dir}djpeg${suffix} -dct int -rgb -outfile testout_gray_islow_rgb.ppm
|
${dir}djpeg${suffix} -dct int -rgb -outfile testout_gray_islow_rgb.ppm
|
||||||
testout_gray_islow.jpg)
|
testout_gray_islow.jpg)
|
||||||
add_test(cjpeg${suffix}-gray-islow-rgb-cmp
|
add_test(djpeg${suffix}-gray-islow-rgb-cmp
|
||||||
${CMAKE_COMMAND} -DMD5=${MD5_PPM_GRAY_ISLOW_RGB}
|
${CMAKE_COMMAND} -DMD5=${MD5_PPM_GRAY_ISLOW_RGB}
|
||||||
-DFILE=testout_gray_islow_rgb.ppm
|
-DFILE=testout_gray_islow_rgb.ppm
|
||||||
-P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
|
-P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
|
||||||
@@ -637,10 +637,10 @@ foreach(libtype shared static)
|
|||||||
endif()
|
endif()
|
||||||
if(WITH_ARITH_DEC)
|
if(WITH_ARITH_DEC)
|
||||||
# CC: RGB->YCC SAMP: h2v2 merged IDCT: ifast ENT: arith
|
# CC: RGB->YCC SAMP: h2v2 merged IDCT: ifast ENT: arith
|
||||||
add_test(cjpeg${suffix}-420m-ifast-ari
|
add_test(djpeg${suffix}-420m-ifast-ari
|
||||||
${dir}djpeg${suffix} -fast -ppm -outfile testout_420m_ifast_ari.ppm
|
${dir}djpeg${suffix} -fast -ppm -outfile testout_420m_ifast_ari.ppm
|
||||||
${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
|
${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
|
||||||
add_test(cjpeg${suffix}-420m-ifast-ari-cmp
|
add_test(djpeg${suffix}-420m-ifast-ari-cmp
|
||||||
${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_IFAST_ARI}
|
${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_IFAST_ARI}
|
||||||
-DFILE=testout_420m_ifast_ari.ppm
|
-DFILE=testout_420m_ifast_ari.ppm
|
||||||
-P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
|
-P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
|
||||||
|
|||||||
@@ -13,6 +13,25 @@ instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
|
|||||||
[3] Fixed an issue in tjBufSizeYUV2() whereby it would erroneously return 0
|
[3] Fixed an issue in tjBufSizeYUV2() whereby it would erroneously return 0
|
||||||
instead of -1 if width was < 1.
|
instead of -1 if width was < 1.
|
||||||
|
|
||||||
|
[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
|
||||||
|
ARM64 platforms (see 1.4 beta1 [5].)
|
||||||
|
|
||||||
|
[6] The close() method in the TJCompressor and TJDecompressor Java classes is
|
||||||
|
now idempotent. Previously, that method would call the native tjDestroy()
|
||||||
|
function even if the TurboJPEG instance had already been destroyed. This
|
||||||
|
caused an exception to be thrown during finalization, if the close() method had
|
||||||
|
already been called. The exception was caught, but it was still an expensive
|
||||||
|
operation.
|
||||||
|
|
||||||
|
[7] The TurboJPEG API previously generated an error ("Could not determine
|
||||||
|
subsampling type for JPEG image") when attempting to decompress grayscale JPEG
|
||||||
|
images that were compressed with a sampling factor other than 1 (for instance,
|
||||||
|
with 'cjpeg -grayscale -sample 2x2'). Subsampling technically has no meaning
|
||||||
|
with grayscale JPEGs, and thus the horizontal and vertical sampling factors
|
||||||
|
for such images are ignored by the decompressor. However, the TurboJPEG API
|
||||||
|
was being too rigid and was expecting the sampling factors to be equal to 1
|
||||||
|
before it treated the image as a grayscale JPEG.
|
||||||
|
|
||||||
[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
|
[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
|
||||||
print the library version and exit.
|
print the library version and exit.
|
||||||
|
|
||||||
@@ -28,6 +47,26 @@ order), the Huffman encoder can produce encoded blocks that approach double the
|
|||||||
size of the unencoded blocks. Thus, the Huffman local buffer was increased to
|
size of the unencoded blocks. Thus, the Huffman local buffer was increased to
|
||||||
256 bytes, which should prevent any such issue from re-occurring in the future.
|
256 bytes, which should prevent any such issue from re-occurring in the future.
|
||||||
|
|
||||||
|
[10] The new tjPlaneSizeYUV(), tjPlaneWidth(), and tjPlaneHeight() functions
|
||||||
|
were not actually usable on any platform except OS X and Windows, because
|
||||||
|
those functions were not included in the libturbojpeg mapfile. This has been
|
||||||
|
fixed.
|
||||||
|
|
||||||
|
[11] Restored the JPP(), JMETHOD(), and FAR macros in the libjpeg-turbo header
|
||||||
|
files. The JPP() and JMETHOD() macros were originally implemented in libjpeg
|
||||||
|
as a way of supporting non-ANSI compilers that lacked support for prototype
|
||||||
|
parameters. libjpeg-turbo has never supported such compilers, but some
|
||||||
|
software packages still use the macros to define their own prototypes.
|
||||||
|
Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
|
||||||
|
have far symbols, but some software packages still use the FAR macro. A pretty
|
||||||
|
good argument can be made that this is a bad practice on the part of the
|
||||||
|
software in question, but since this affects more than one package, it's just
|
||||||
|
easier to fix it here.
|
||||||
|
|
||||||
|
[12] Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
|
||||||
|
for iOS, and included an ARMv8 architecture in all of the binaries installed by
|
||||||
|
the "official" libjpeg-turbo SDK for OS X.
|
||||||
|
|
||||||
|
|
||||||
1.3.90 (1.4 beta1)
|
1.3.90 (1.4 beta1)
|
||||||
==================
|
==================
|
||||||
@@ -280,7 +319,7 @@ configure/CMake switch in order to retain strict API/ABI compatibility with the
|
|||||||
libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.) See
|
libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.) See
|
||||||
README-turbo.txt for more details.
|
README-turbo.txt for more details.
|
||||||
|
|
||||||
[13] Added ARM v7s architecture to libjpeg.a and libturbojpeg.a in the official
|
[13] Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
|
||||||
libjpeg-turbo binary package for OS X, so that those libraries can be used to
|
libjpeg-turbo binary package for OS X, so that those libraries can be used to
|
||||||
build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
|
build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
|
||||||
|
|
||||||
@@ -363,7 +402,7 @@ K component is assigned a component ID of 1 instead of 4. Although these files
|
|||||||
are in violation of the spec, other JPEG implementations handle them
|
are in violation of the spec, other JPEG implementations handle them
|
||||||
correctly.
|
correctly.
|
||||||
|
|
||||||
[7] Added ARM v6 and ARM v7 architectures to libjpeg.a and libturbojpeg.a in
|
[7] Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
|
||||||
the official libjpeg-turbo binary package for OS X, so that those libraries can
|
the official libjpeg-turbo binary package for OS X, so that those libraries can
|
||||||
be used to build both OS X and iOS applications.
|
be used to build both OS X and iOS applications.
|
||||||
|
|
||||||
|
|||||||
11
Makefile.am
11
Makefile.am
@@ -276,7 +276,10 @@ MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
|
|||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
test: testclean all
|
.PHONY: test
|
||||||
|
test: tjquicktest bittest
|
||||||
|
|
||||||
|
tjquicktest: testclean all
|
||||||
|
|
||||||
if WITH_TURBOJPEG
|
if WITH_TURBOJPEG
|
||||||
if WITH_JAVA
|
if WITH_JAVA
|
||||||
@@ -294,6 +297,8 @@ endif
|
|||||||
./tjunittest -yuv -noyuvpad
|
./tjunittest -yuv -noyuvpad
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
bittest: testclean all
|
||||||
|
|
||||||
# These tests are carefully crafted to provide full coverage of as many of the
|
# These tests are carefully crafted to provide full coverage of as many of the
|
||||||
# underlying algorithms as possible (including all of the SIMD-accelerated
|
# underlying algorithms as possible (including all of the SIMD-accelerated
|
||||||
# ones.)
|
# ones.)
|
||||||
@@ -598,12 +603,12 @@ udmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
|||||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
|
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
|
||||||
|
|
||||||
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S}
|
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
||||||
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||||
sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S}
|
sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|||||||
12
configure.ac
12
configure.ac
@@ -443,7 +443,11 @@ if test "x${with_simd}" != "xno"; then
|
|||||||
AC_MSG_RESULT([yes (arm)])
|
AC_MSG_RESULT([yes (arm)])
|
||||||
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
||||||
AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
|
AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
|
||||||
[AC_MSG_RESULT([yes])
|
[if test "x$ac_use_gas_preprocessor" = "xyes"; then
|
||||||
|
AC_MSG_RESULT([yes (with gas-preprocessor)])
|
||||||
|
else
|
||||||
|
AC_MSG_RESULT([yes])
|
||||||
|
fi
|
||||||
simd_arch=arm],
|
simd_arch=arm],
|
||||||
[AC_MSG_RESULT([no])
|
[AC_MSG_RESULT([no])
|
||||||
with_simd=no])
|
with_simd=no])
|
||||||
@@ -459,7 +463,11 @@ if test "x${with_simd}" != "xno"; then
|
|||||||
AC_MSG_RESULT([yes (arm64)])
|
AC_MSG_RESULT([yes (arm64)])
|
||||||
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
||||||
AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
|
AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
|
||||||
[AC_MSG_RESULT([yes])
|
[if test "x$ac_use_gas_preprocessor" = "xyes"; then
|
||||||
|
AC_MSG_RESULT([yes (with gas-preprocessor)])
|
||||||
|
else
|
||||||
|
AC_MSG_RESULT([yes])
|
||||||
|
fi
|
||||||
simd_arch=aarch64],
|
simd_arch=aarch64],
|
||||||
[AC_MSG_RESULT([no])
|
[AC_MSG_RESULT([no])
|
||||||
with_simd=no])
|
with_simd=no])
|
||||||
|
|||||||
@@ -567,6 +567,7 @@ public class TJCompressor {
|
|||||||
* Free the native structures associated with this compressor instance.
|
* Free the native structures associated with this compressor instance.
|
||||||
*/
|
*/
|
||||||
public void close() throws Exception {
|
public void close() throws Exception {
|
||||||
|
if (handle != 0)
|
||||||
destroy();
|
destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -834,6 +834,7 @@ public class TJDecompressor {
|
|||||||
* Free the native structures associated with this decompressor instance.
|
* Free the native structures associated with this decompressor instance.
|
||||||
*/
|
*/
|
||||||
public void close() throws Exception {
|
public void close() throws Exception {
|
||||||
|
if (handle != 0)
|
||||||
destroy();
|
destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
2
jchuff.c
2
jchuff.c
@@ -37,7 +37,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* NOTE: Both GCC and Clang define __GNUC__ */
|
/* NOTE: Both GCC and Clang define __GNUC__ */
|
||||||
#if defined __GNUC__ && defined __arm__
|
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||||
#if !defined __thumb__ || defined __thumb2__
|
#if !defined __thumb__ || defined __thumb2__
|
||||||
#define USE_CLZ_INTRINSIC
|
#define USE_CLZ_INTRINSIC
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
140
jcmainct.c
140
jcmainct.c
@@ -17,14 +17,6 @@
|
|||||||
#include "jpeglib.h"
|
#include "jpeglib.h"
|
||||||
|
|
||||||
|
|
||||||
/* Note: currently, there is no operating mode in which a full-image buffer
|
|
||||||
* is needed at this step. If there were, that mode could not be used with
|
|
||||||
* "raw data" input, since this module is bypassed in that case. However,
|
|
||||||
* we've left the code here for possible use in special applications.
|
|
||||||
*/
|
|
||||||
#undef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
|
|
||||||
|
|
||||||
/* Private buffer controller object */
|
/* Private buffer controller object */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -40,13 +32,6 @@ typedef struct {
|
|||||||
* points to the currently accessible strips of the virtual arrays.
|
* points to the currently accessible strips of the virtual arrays.
|
||||||
*/
|
*/
|
||||||
JSAMPARRAY buffer[MAX_COMPONENTS];
|
JSAMPARRAY buffer[MAX_COMPONENTS];
|
||||||
|
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
/* If using full-image storage, this array holds pointers to virtual-array
|
|
||||||
* control blocks for each component. Unused if not full-image storage.
|
|
||||||
*/
|
|
||||||
jvirt_sarray_ptr whole_image[MAX_COMPONENTS];
|
|
||||||
#endif
|
|
||||||
} my_main_controller;
|
} my_main_controller;
|
||||||
|
|
||||||
typedef my_main_controller * my_main_ptr;
|
typedef my_main_controller * my_main_ptr;
|
||||||
@@ -56,11 +41,6 @@ typedef my_main_controller * my_main_ptr;
|
|||||||
METHODDEF(void) process_data_simple_main
|
METHODDEF(void) process_data_simple_main
|
||||||
(j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
(j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
||||||
JDIMENSION in_rows_avail);
|
JDIMENSION in_rows_avail);
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
METHODDEF(void) process_data_buffer_main
|
|
||||||
(j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
|
||||||
JDIMENSION in_rows_avail);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -76,32 +56,14 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
|
|||||||
if (cinfo->raw_data_in)
|
if (cinfo->raw_data_in)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (pass_mode != JBUF_PASS_THRU)
|
||||||
|
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||||
|
|
||||||
main_ptr->cur_iMCU_row = 0; /* initialize counters */
|
main_ptr->cur_iMCU_row = 0; /* initialize counters */
|
||||||
main_ptr->rowgroup_ctr = 0;
|
main_ptr->rowgroup_ctr = 0;
|
||||||
main_ptr->suspended = FALSE;
|
main_ptr->suspended = FALSE;
|
||||||
main_ptr->pass_mode = pass_mode; /* save mode for use by process_data */
|
main_ptr->pass_mode = pass_mode; /* save mode for use by process_data */
|
||||||
|
|
||||||
switch (pass_mode) {
|
|
||||||
case JBUF_PASS_THRU:
|
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
if (main_ptr->whole_image[0] != NULL)
|
|
||||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
|
||||||
#endif
|
|
||||||
main_ptr->pub.process_data = process_data_simple_main;
|
main_ptr->pub.process_data = process_data_simple_main;
|
||||||
break;
|
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
case JBUF_SAVE_SOURCE:
|
|
||||||
case JBUF_CRANK_DEST:
|
|
||||||
case JBUF_SAVE_AND_PASS:
|
|
||||||
if (main_ptr->whole_image[0] == NULL)
|
|
||||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
|
||||||
main_ptr->pub.process_data = process_data_buffer_main;
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
default:
|
|
||||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -160,85 +122,6 @@ process_data_simple_main (j_compress_ptr cinfo,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Process some data.
|
|
||||||
* This routine handles all of the modes that use a full-size buffer.
|
|
||||||
*/
|
|
||||||
|
|
||||||
METHODDEF(void)
|
|
||||||
process_data_buffer_main (j_compress_ptr cinfo,
|
|
||||||
JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
|
|
||||||
JDIMENSION in_rows_avail)
|
|
||||||
{
|
|
||||||
my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
|
|
||||||
int ci;
|
|
||||||
jpeg_component_info *compptr;
|
|
||||||
boolean writing = (main_ptr->pass_mode != JBUF_CRANK_DEST);
|
|
||||||
|
|
||||||
while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
|
|
||||||
/* Realign the virtual buffers if at the start of an iMCU row. */
|
|
||||||
if (main_ptr->rowgroup_ctr == 0) {
|
|
||||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
|
||||||
ci++, compptr++) {
|
|
||||||
main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray)
|
|
||||||
((j_common_ptr) cinfo, main_ptr->whole_image[ci],
|
|
||||||
main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
|
|
||||||
(JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
|
|
||||||
}
|
|
||||||
/* In a read pass, pretend we just read some source data. */
|
|
||||||
if (! writing) {
|
|
||||||
*in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
|
|
||||||
main_ptr->rowgroup_ctr = DCTSIZE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If a write pass, read input data until the current iMCU row is full. */
|
|
||||||
/* Note: preprocessor will pad if necessary to fill the last iMCU row. */
|
|
||||||
if (writing) {
|
|
||||||
(*cinfo->prep->pre_process_data) (cinfo,
|
|
||||||
input_buf, in_row_ctr, in_rows_avail,
|
|
||||||
main_ptr->buffer, &main_ptr->rowgroup_ctr,
|
|
||||||
(JDIMENSION) DCTSIZE);
|
|
||||||
/* Return to application if we need more data to fill the iMCU row. */
|
|
||||||
if (main_ptr->rowgroup_ctr < DCTSIZE)
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Emit data, unless this is a sink-only pass. */
|
|
||||||
if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) {
|
|
||||||
if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
|
|
||||||
/* If compressor did not consume the whole row, then we must need to
|
|
||||||
* suspend processing and return to the application. In this situation
|
|
||||||
* we pretend we didn't yet consume the last input row; otherwise, if
|
|
||||||
* it happened to be the last row of the image, the application would
|
|
||||||
* think we were done.
|
|
||||||
*/
|
|
||||||
if (! main_ptr->suspended) {
|
|
||||||
(*in_row_ctr)--;
|
|
||||||
main_ptr->suspended = TRUE;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/* We did finish the row. Undo our little suspension hack if a previous
|
|
||||||
* call suspended; then mark the main buffer empty.
|
|
||||||
*/
|
|
||||||
if (main_ptr->suspended) {
|
|
||||||
(*in_row_ctr)++;
|
|
||||||
main_ptr->suspended = FALSE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If get here, we are done with this iMCU row. Mark buffer empty. */
|
|
||||||
main_ptr->rowgroup_ctr = 0;
|
|
||||||
main_ptr->cur_iMCU_row++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* FULL_MAIN_BUFFER_SUPPORTED */
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize main buffer controller.
|
* Initialize main buffer controller.
|
||||||
*/
|
*/
|
||||||
@@ -264,25 +147,8 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
|
|||||||
* may be of a different size.
|
* may be of a different size.
|
||||||
*/
|
*/
|
||||||
if (need_full_buffer) {
|
if (need_full_buffer) {
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
/* Allocate a full-image virtual array for each component */
|
|
||||||
/* Note we pad the bottom to a multiple of the iMCU height */
|
|
||||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
|
||||||
ci++, compptr++) {
|
|
||||||
main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
|
|
||||||
((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
|
|
||||||
compptr->width_in_blocks * DCTSIZE,
|
|
||||||
(JDIMENSION) jround_up((long) compptr->height_in_blocks,
|
|
||||||
(long) compptr->v_samp_factor) * DCTSIZE,
|
|
||||||
(JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
#ifdef FULL_MAIN_BUFFER_SUPPORTED
|
|
||||||
main_ptr->whole_image[0] = NULL; /* flag for no virtual arrays */
|
|
||||||
#endif
|
|
||||||
/* Allocate a strip buffer for each component */
|
/* Allocate a strip buffer for each component */
|
||||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
||||||
ci++, compptr++) {
|
ci++, compptr++) {
|
||||||
|
|||||||
@@ -514,8 +514,9 @@ jinit_downsampler (j_compress_ptr cinfo)
|
|||||||
#endif
|
#endif
|
||||||
downsample->methods[ci] = h2v2_smooth_downsample;
|
downsample->methods[ci] = h2v2_smooth_downsample;
|
||||||
downsample->pub.need_context_rows = TRUE;
|
downsample->pub.need_context_rows = TRUE;
|
||||||
} else {
|
} else
|
||||||
#endif
|
#endif
|
||||||
|
{
|
||||||
if (jsimd_can_h2v2_downsample())
|
if (jsimd_can_h2v2_downsample())
|
||||||
downsample->methods[ci] = jsimd_h2v2_downsample;
|
downsample->methods[ci] = jsimd_h2v2_downsample;
|
||||||
else
|
else
|
||||||
|
|||||||
18
jmorecfg.h
18
jmorecfg.h
@@ -180,6 +180,24 @@ typedef unsigned int JDIMENSION;
|
|||||||
#define EXTERN(type) extern type
|
#define EXTERN(type) extern type
|
||||||
|
|
||||||
|
|
||||||
|
/* Originally, this macro was used as a way of defining function prototypes
|
||||||
|
* for both modern compilers as well as older compilers that did not support
|
||||||
|
* prototype parameters. libjpeg-turbo has never supported these older,
|
||||||
|
* non-ANSI compilers, but the macro is still included because there is some
|
||||||
|
* software out there that uses it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define JMETHOD(type,methodname,arglist) type (*methodname) arglist
|
||||||
|
|
||||||
|
|
||||||
|
/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
|
||||||
|
* but again, some software relies on this macro.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#undef FAR
|
||||||
|
#define FAR
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* On a few systems, type boolean and/or its values FALSE, TRUE may appear
|
* On a few systems, type boolean and/or its values FALSE, TRUE may appear
|
||||||
* in standard header files. Or you may have conflicts with application-
|
* in standard header files. Or you may have conflicts with application-
|
||||||
|
|||||||
10
jpeglib.h
10
jpeglib.h
@@ -923,6 +923,16 @@ struct jpeg_memory_mgr {
|
|||||||
typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
|
typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
|
||||||
|
|
||||||
|
|
||||||
|
/* Originally, this macro was used as a way of defining function prototypes
|
||||||
|
* for both modern compilers as well as older compilers that did not support
|
||||||
|
* prototype parameters. libjpeg-turbo has never supported these older,
|
||||||
|
* non-ANSI compilers, but the macro is still included because there is some
|
||||||
|
* software out there that uses it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define JPP(arglist) arglist
|
||||||
|
|
||||||
|
|
||||||
/* Default error-management setup */
|
/* Default error-management setup */
|
||||||
EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr * err);
|
EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr * err);
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ onexit()
|
|||||||
|
|
||||||
usage()
|
usage()
|
||||||
{
|
{
|
||||||
echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARM v6 build dir]] [-buildarmv7 [ARM v7 build dir]] [-buildarmv7s [ARM v7s build dir]]"
|
echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARMv6 build dir]] [-buildarmv7 [ARMv7 build dir]] [-buildarmv7s [ARMv7s build dir] [-buildarmv8 [ARMv8 build dir]] [-lipo [path to lipo]]"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -33,7 +33,10 @@ BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
|
|||||||
BUILDARMV7=0
|
BUILDARMV7=0
|
||||||
BUILDDIRARMV7S=@abs_top_srcdir@/iosarmv7s
|
BUILDDIRARMV7S=@abs_top_srcdir@/iosarmv7s
|
||||||
BUILDARMV7S=0
|
BUILDARMV7S=0
|
||||||
|
BUILDDIRARMV8=@abs_top_srcdir@/iosarmv8
|
||||||
|
BUILDARMV8=0
|
||||||
WITH_JAVA=@WITH_JAVA@
|
WITH_JAVA=@WITH_JAVA@
|
||||||
|
LIPO=lipo
|
||||||
|
|
||||||
PREFIX=%{__prefix}
|
PREFIX=%{__prefix}
|
||||||
BINDIR=%{__bindir}
|
BINDIR=%{__bindir}
|
||||||
@@ -75,6 +78,21 @@ while [ $# -gt 0 ]; do
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
|
-buildarmv8)
|
||||||
|
BUILDARMV8=1
|
||||||
|
if [ $# -gt 1 ]; then
|
||||||
|
if [[ ! "$2" =~ -.* ]]; then
|
||||||
|
BUILDDIRARMV8=$2; shift
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
-lipo)
|
||||||
|
if [ $# -gt 1 ]; then
|
||||||
|
if [[ ! "$2" =~ -.* ]]; then
|
||||||
|
LIPO=$2; shift
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
@@ -110,50 +128,50 @@ if [ $BUILD32 = 1 ]; then
|
|||||||
popd
|
popd
|
||||||
if [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
if [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||||
elif [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
elif [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||||
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||||
fi
|
fi
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
|
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
|
||||||
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
|
-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.0.dylib \
|
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
|
-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
|
||||||
-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
|
-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/cjpeg \
|
-arch i386 $TMPDIR/dist.x86/$BINDIR/cjpeg \
|
||||||
-arch x86_64 $PKGROOT/$BINDIR/cjpeg \
|
-arch x86_64 $PKGROOT/$BINDIR/cjpeg \
|
||||||
-output $PKGROOT/$BINDIR/cjpeg
|
-output $PKGROOT/$BINDIR/cjpeg
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/djpeg \
|
-arch i386 $TMPDIR/dist.x86/$BINDIR/djpeg \
|
||||||
-arch x86_64 $PKGROOT/$BINDIR/djpeg \
|
-arch x86_64 $PKGROOT/$BINDIR/djpeg \
|
||||||
-output $PKGROOT/$BINDIR/djpeg
|
-output $PKGROOT/$BINDIR/djpeg
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/jpegtran \
|
-arch i386 $TMPDIR/dist.x86/$BINDIR/jpegtran \
|
||||||
-arch x86_64 $PKGROOT/$BINDIR/jpegtran \
|
-arch x86_64 $PKGROOT/$BINDIR/jpegtran \
|
||||||
-output $PKGROOT/$BINDIR/jpegtran
|
-output $PKGROOT/$BINDIR/jpegtran
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/tjbench \
|
-arch i386 $TMPDIR/dist.x86/$BINDIR/tjbench \
|
||||||
-arch x86_64 $PKGROOT/$BINDIR/tjbench \
|
-arch x86_64 $PKGROOT/$BINDIR/tjbench \
|
||||||
-output $PKGROOT/$BINDIR/tjbench
|
-output $PKGROOT/$BINDIR/tjbench
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/rdjpgcom \
|
-arch i386 $TMPDIR/dist.x86/$BINDIR/rdjpgcom \
|
||||||
-arch x86_64 $PKGROOT/$BINDIR/rdjpgcom \
|
-arch x86_64 $PKGROOT/$BINDIR/rdjpgcom \
|
||||||
-output $PKGROOT/$BINDIR/rdjpgcom
|
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
|
-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
|
||||||
-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
|
-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
|
||||||
-output $PKGROOT/$BINDIR/wrjpgcom
|
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||||
@@ -162,71 +180,258 @@ fi
|
|||||||
|
|
||||||
if [ $BUILDARMV6 = 1 ]; then
|
if [ $BUILDARMV6 = 1 ]; then
|
||||||
if [ ! -d $BUILDDIRARMV6 ]; then
|
if [ ! -d $BUILDDIRARMV6 ]; then
|
||||||
echo ERROR: ARM v6 build directory $BUILDDIRARMV6 does not exist
|
echo ERROR: ARMv6 build directory $BUILDDIRARMV6 does not exist
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [ ! -f $BUILDDIRARMV6/Makefile ]; then
|
if [ ! -f $BUILDDIRARMV6/Makefile ]; then
|
||||||
echo ERROR: ARM v6 build directory $BUILDDIRARMV6 is not configured
|
echo ERROR: ARMv6 build directory $BUILDDIRARMV6 is not configured
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
mkdir -p $TMPDIR/dist.armv6
|
mkdir -p $TMPDIR/dist.armv6
|
||||||
pushd $BUILDDIRARMV6
|
pushd $BUILDDIRARMV6
|
||||||
make install DESTDIR=$TMPDIR/dist.armv6
|
make install DESTDIR=$TMPDIR/dist.armv6
|
||||||
popd
|
popd
|
||||||
lipo -create \
|
if [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||||
|
elif [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||||
|
fi
|
||||||
|
$LIPO -create \
|
||||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.a \
|
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||||
|
$LIPO -create \
|
||||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||||
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.a \
|
-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/cjpeg \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$BINDIR/cjpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/cjpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/djpeg \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$BINDIR/djpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/djpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/jpegtran \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$BINDIR/jpegtran \
|
||||||
|
-output $PKGROOT/$BINDIR/jpegtran
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/tjbench \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$BINDIR/tjbench \
|
||||||
|
-output $PKGROOT/$BINDIR/tjbench
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/rdjpgcom \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$BINDIR/rdjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/wrjpgcom \
|
||||||
|
-arch arm $TMPDIR/dist.armv6/$BINDIR/wrjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $BUILDARMV7 = 1 ]; then
|
if [ $BUILDARMV7 = 1 ]; then
|
||||||
if [ ! -d $BUILDDIRARMV7 ]; then
|
if [ ! -d $BUILDDIRARMV7 ]; then
|
||||||
echo ERROR: ARM v7 build directory $BUILDDIRARMV7 does not exist
|
echo ERROR: ARMv7 build directory $BUILDDIRARMV7 does not exist
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [ ! -f $BUILDDIRARMV7/Makefile ]; then
|
if [ ! -f $BUILDDIRARMV7/Makefile ]; then
|
||||||
echo ERROR: ARM v7 build directory $BUILDDIRARMV7 is not configured
|
echo ERROR: ARMv7 build directory $BUILDDIRARMV7 is not configured
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
mkdir -p $TMPDIR/dist.armv7
|
mkdir -p $TMPDIR/dist.armv7
|
||||||
pushd $BUILDDIRARMV7
|
pushd $BUILDDIRARMV7
|
||||||
make install DESTDIR=$TMPDIR/dist.armv7
|
make install DESTDIR=$TMPDIR/dist.armv7
|
||||||
popd
|
popd
|
||||||
lipo -create \
|
if [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||||
|
elif [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||||
|
fi
|
||||||
|
$LIPO -create \
|
||||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.a \
|
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||||
|
$LIPO -create \
|
||||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||||
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.a \
|
-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/cjpeg \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$BINDIR/cjpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/cjpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/djpeg \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$BINDIR/djpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/djpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/jpegtran \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$BINDIR/jpegtran \
|
||||||
|
-output $PKGROOT/$BINDIR/jpegtran
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/tjbench \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$BINDIR/tjbench \
|
||||||
|
-output $PKGROOT/$BINDIR/tjbench
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/rdjpgcom \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$BINDIR/rdjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/wrjpgcom \
|
||||||
|
-arch arm $TMPDIR/dist.armv7/$BINDIR/wrjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $BUILDARMV7S = 1 ]; then
|
if [ $BUILDARMV7S = 1 ]; then
|
||||||
if [ ! -d $BUILDDIRARMV7S ]; then
|
if [ ! -d $BUILDDIRARMV7S ]; then
|
||||||
echo ERROR: ARM v7s build directory $BUILDDIRARMV7S does not exist
|
echo ERROR: ARMv7s build directory $BUILDDIRARMV7S does not exist
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [ ! -f $BUILDDIRARMV7S/Makefile ]; then
|
if [ ! -f $BUILDDIRARMV7S/Makefile ]; then
|
||||||
echo ERROR: ARM v7s build directory $BUILDDIRARMV7S is not configured
|
echo ERROR: ARMv7s build directory $BUILDDIRARMV7S is not configured
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
mkdir -p $TMPDIR/dist.armv7s
|
mkdir -p $TMPDIR/dist.armv7s
|
||||||
pushd $BUILDDIRARMV7S
|
pushd $BUILDDIRARMV7S
|
||||||
make install DESTDIR=$TMPDIR/dist.armv7s
|
make install DESTDIR=$TMPDIR/dist.armv7s
|
||||||
popd
|
popd
|
||||||
lipo -create \
|
if [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||||
|
elif [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||||
|
fi
|
||||||
|
$LIPO -create \
|
||||||
$PKGROOT/$LIBDIR/libjpeg.a \
|
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.a \
|
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libjpeg.a
|
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||||
lipo -create \
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||||
|
$LIPO -create \
|
||||||
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||||
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.a \
|
-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.a \
|
||||||
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/cjpeg \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$BINDIR/cjpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/cjpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/djpeg \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$BINDIR/djpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/djpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/jpegtran \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$BINDIR/jpegtran \
|
||||||
|
-output $PKGROOT/$BINDIR/jpegtran
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/tjbench \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$BINDIR/tjbench \
|
||||||
|
-output $PKGROOT/$BINDIR/tjbench
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/rdjpgcom \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$BINDIR/rdjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/wrjpgcom \
|
||||||
|
-arch arm $TMPDIR/dist.armv7s/$BINDIR/wrjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $BUILDARMV8 = 1 ]; then
|
||||||
|
if [ ! -d $BUILDDIRARMV8 ]; then
|
||||||
|
echo ERROR: ARMv8 build directory $BUILDDIRARMV8 does not exist
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ ! -f $BUILDDIRARMV8/Makefile ]; then
|
||||||
|
echo ERROR: ARMv8 build directory $BUILDDIRARMV8 is not configured
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
mkdir -p $TMPDIR/dist.armv8
|
||||||
|
pushd $BUILDDIRARMV8
|
||||||
|
make install DESTDIR=$TMPDIR/dist.armv8
|
||||||
|
popd
|
||||||
|
if [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||||
|
elif [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
|
||||||
|
! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
|
||||||
|
fi
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libjpeg.a \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.a \
|
||||||
|
-output $PKGROOT/$LIBDIR/libjpeg.a
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.0.dylib \
|
||||||
|
-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$LIBDIR/libturbojpeg.a \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.a \
|
||||||
|
-output $PKGROOT/$LIBDIR/libturbojpeg.a
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/cjpeg \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/cjpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/cjpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/djpeg \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/djpeg \
|
||||||
|
-output $PKGROOT/$BINDIR/djpeg
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/jpegtran \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/jpegtran \
|
||||||
|
-output $PKGROOT/$BINDIR/jpegtran
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/tjbench \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/tjbench \
|
||||||
|
-output $PKGROOT/$BINDIR/tjbench
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/rdjpgcom \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/rdjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/rdjpgcom
|
||||||
|
$LIPO -create \
|
||||||
|
$PKGROOT/$BINDIR/wrjpgcom \
|
||||||
|
-arch arm64 $TMPDIR/dist.armv8/$BINDIR/wrjpgcom \
|
||||||
|
-output $PKGROOT/$BINDIR/wrjpgcom
|
||||||
fi
|
fi
|
||||||
|
|
||||||
install_name_tool -id $LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
install_name_tool -id $LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ Section "Uninstall"
|
|||||||
|
|
||||||
!ifdef GCC
|
!ifdef GCC
|
||||||
Delete $INSTDIR\bin\libjpeg-@DLL_VERSION@.dll
|
Delete $INSTDIR\bin\libjpeg-@DLL_VERSION@.dll
|
||||||
|
Delete $INSTDIR\bin\libturbojpeg.dll
|
||||||
Delete $SYSDIR\libturbojpeg.dll
|
Delete $SYSDIR\libturbojpeg.dll
|
||||||
Delete $INSTDIR\lib\libturbojpeg.dll.a"
|
Delete $INSTDIR\lib\libturbojpeg.dll.a"
|
||||||
Delete $INSTDIR\lib\libturbojpeg.a"
|
Delete $INSTDIR\lib\libturbojpeg.a"
|
||||||
@@ -118,6 +119,7 @@ Section "Uninstall"
|
|||||||
Delete $INSTDIR\lib\libjpeg.a"
|
Delete $INSTDIR\lib\libjpeg.a"
|
||||||
!else
|
!else
|
||||||
Delete $INSTDIR\bin\jpeg@DLL_VERSION@.dll
|
Delete $INSTDIR\bin\jpeg@DLL_VERSION@.dll
|
||||||
|
Delete $INSTDIR\bin\turbojpeg.dll
|
||||||
Delete $SYSDIR\turbojpeg.dll
|
Delete $SYSDIR\turbojpeg.dll
|
||||||
Delete $INSTDIR\lib\jpeg.lib
|
Delete $INSTDIR\lib\jpeg.lib
|
||||||
Delete $INSTDIR\lib\jpeg-static.lib
|
Delete $INSTDIR\lib\jpeg-static.lib
|
||||||
|
|||||||
@@ -72,9 +72,16 @@ endif
|
|||||||
|
|
||||||
if SIMD_POWERPC
|
if SIMD_POWERPC
|
||||||
|
|
||||||
libsimd_la_SOURCES = jsimd_powerpc.c jsimd_powerpc_altivec.c
|
libsimd_la_SOURCES = jsimd_powerpc.c \
|
||||||
|
jccolor-altivec.c jcgray-altivec.c \
|
||||||
|
jfdctfst-altivec.c jfdctint-altivec.c \
|
||||||
|
jidctfst-altivec.c jidctint-altivec.c \
|
||||||
|
jquanti-altivec.c
|
||||||
libsimd_la_CFLAGS = -maltivec
|
libsimd_la_CFLAGS = -maltivec
|
||||||
|
|
||||||
|
jccolor-altivec.lo: jccolext-altivec.c
|
||||||
|
jcgray-altivec.lo: jcgryext-altivec.c
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
AM_CPPFLAGS = -I$(top_srcdir)
|
AM_CPPFLAGS = -I$(top_srcdir)
|
||||||
|
|||||||
250
simd/jccolext-altivec.c
Normal file
250
simd/jccolext-altivec.c
Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* Copyright (C) 2014, Jay Foad.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* This file is included by jccolor-altivec.c */
|
||||||
|
|
||||||
|
|
||||||
|
void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||||
|
JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows)
|
||||||
|
{
|
||||||
|
JSAMPROW inptr;
|
||||||
|
JSAMPROW outptr0, outptr1, outptr2;
|
||||||
|
int pitch;
|
||||||
|
__vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
|
||||||
|
rgbg1, rgbg2, rgbg3, y, cb, cr;
|
||||||
|
#if RGB_PIXELSIZE == 4
|
||||||
|
__vector unsigned char rgb4;
|
||||||
|
#endif
|
||||||
|
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
|
||||||
|
__vector unsigned short y01, y23, cr01, cr23, cb01, cb23;
|
||||||
|
__vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
|
||||||
|
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
|
||||||
|
pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
|
||||||
|
pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
|
||||||
|
__vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
|
||||||
|
__vector int pd_onehalf = { __4X(ONE_HALF) },
|
||||||
|
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
|
||||||
|
__vector unsigned char zero = { __16X(0) },
|
||||||
|
shift_pack_index =
|
||||||
|
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
|
||||||
|
|
||||||
|
while (--num_rows >= 0) {
|
||||||
|
inptr = *input_buf++;
|
||||||
|
outptr0 = output_buf[0][output_row];
|
||||||
|
outptr1 = output_buf[1][output_row];
|
||||||
|
outptr2 = output_buf[2][output_row];
|
||||||
|
output_row++;
|
||||||
|
|
||||||
|
for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
|
||||||
|
pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
|
||||||
|
outptr0 += 16, outptr1 += 16, outptr2 += 16) {
|
||||||
|
|
||||||
|
#if RGB_PIXELSIZE == 3
|
||||||
|
/* Load 16 pixels == 48 bytes */
|
||||||
|
if ((size_t)inptr & 15) {
|
||||||
|
__vector unsigned char unaligned_shift_index;
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
else
|
||||||
|
rgb1 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
else
|
||||||
|
rgb2 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 48)
|
||||||
|
rgb3 = vec_ld(48, inptr);
|
||||||
|
else
|
||||||
|
rgb3 = vec_ld(-1, inptr + pitch);
|
||||||
|
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||||
|
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||||
|
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||||
|
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||||
|
} else {
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
|
||||||
|
* rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
|
||||||
|
* rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
|
||||||
|
*
|
||||||
|
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||||
|
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||||
|
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||||
|
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||||
|
*/
|
||||||
|
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
|
||||||
|
rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
|
||||||
|
rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
|
||||||
|
rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
|
||||||
|
#else
|
||||||
|
/* Load 16 pixels == 64 bytes */
|
||||||
|
if ((size_t)inptr & 15) {
|
||||||
|
__vector unsigned char unaligned_shift_index;
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
else
|
||||||
|
rgb1 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
else
|
||||||
|
rgb2 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 48)
|
||||||
|
rgb3 = vec_ld(48, inptr);
|
||||||
|
else
|
||||||
|
rgb3 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 64)
|
||||||
|
rgb4 = vec_ld(64, inptr);
|
||||||
|
else
|
||||||
|
rgb4 = vec_ld(-1, inptr + pitch);
|
||||||
|
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||||
|
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||||
|
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||||
|
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||||
|
rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
|
||||||
|
} else {
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
if (pitch > 48)
|
||||||
|
rgb3 = vec_ld(48, inptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
|
||||||
|
* rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
|
||||||
|
* rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
|
||||||
|
* rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
|
||||||
|
*
|
||||||
|
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||||
|
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||||
|
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||||
|
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||||
|
*/
|
||||||
|
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
|
||||||
|
* bg0 = B0 G0 B1 G1 B2 G2 B3 G3
|
||||||
|
* ...
|
||||||
|
*
|
||||||
|
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
|
||||||
|
* support unsigned vectors.
|
||||||
|
*/
|
||||||
|
rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
|
||||||
|
bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
|
||||||
|
rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
|
||||||
|
bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
|
||||||
|
rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
|
||||||
|
bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
|
||||||
|
rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
|
||||||
|
bg3 = (__vector signed short)vec_mergel(zero, rgbg3);
|
||||||
|
|
||||||
|
/* (Original)
|
||||||
|
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||||
|
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||||
|
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||||
|
*
|
||||||
|
* (This implementation)
|
||||||
|
* Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||||
|
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||||
|
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Calculate Y values */
|
||||||
|
|
||||||
|
y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y0 = vec_msums(bg0, pw_f0114_f0250, y0);
|
||||||
|
y1 = vec_msums(bg1, pw_f0114_f0250, y1);
|
||||||
|
y2 = vec_msums(bg2, pw_f0114_f0250, y2);
|
||||||
|
y3 = vec_msums(bg3, pw_f0114_f0250, y3);
|
||||||
|
/* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
|
||||||
|
* each dword into a new 16-bit vector, which is the equivalent of
|
||||||
|
* descaling the 32-bit results (right-shifting by 16 bits) and then
|
||||||
|
* packing them.
|
||||||
|
*/
|
||||||
|
y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
|
||||||
|
shift_pack_index);
|
||||||
|
y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
|
||||||
|
shift_pack_index);
|
||||||
|
y = vec_pack(y01, y23);
|
||||||
|
vec_st(y, 0, outptr0);
|
||||||
|
|
||||||
|
/* Calculate Cb values */
|
||||||
|
cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||||
|
cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||||
|
cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||||
|
cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
|
||||||
|
cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cb0);
|
||||||
|
cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cb1);
|
||||||
|
cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cb2);
|
||||||
|
cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cb3);
|
||||||
|
cb01 = vec_perm((__vector unsigned short)cb0,
|
||||||
|
(__vector unsigned short)cb1, shift_pack_index);
|
||||||
|
cb23 = vec_perm((__vector unsigned short)cb2,
|
||||||
|
(__vector unsigned short)cb3, shift_pack_index);
|
||||||
|
cb = vec_pack(cb01, cb23);
|
||||||
|
vec_st(cb, 0, outptr1);
|
||||||
|
|
||||||
|
/* Calculate Cr values */
|
||||||
|
cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||||
|
cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||||
|
cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||||
|
cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
|
||||||
|
cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cr0);
|
||||||
|
cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cr1);
|
||||||
|
cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cr2);
|
||||||
|
cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
|
||||||
|
(__vector unsigned int)cr3);
|
||||||
|
cr01 = vec_perm((__vector unsigned short)cr0,
|
||||||
|
(__vector unsigned short)cr1, shift_pack_index);
|
||||||
|
cr23 = vec_perm((__vector unsigned short)cr2,
|
||||||
|
(__vector unsigned short)cr3, shift_pack_index);
|
||||||
|
cr = vec_pack(cr01, cr23);
|
||||||
|
vec_st(cr, 0, outptr2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
104
simd/jccolor-altivec.c
Normal file
104
simd/jccolor-altivec.c
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* RGB --> YCC CONVERSION */
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define F_0_081 5329 /* FIX(0.08131) */
|
||||||
|
#define F_0_114 7471 /* FIX(0.11400) */
|
||||||
|
#define F_0_168 11059 /* FIX(0.16874) */
|
||||||
|
#define F_0_250 16384 /* FIX(0.25000) */
|
||||||
|
#define F_0_299 19595 /* FIX(0.29900) */
|
||||||
|
#define F_0_331 21709 /* FIX(0.33126) */
|
||||||
|
#define F_0_418 27439 /* FIX(0.41869) */
|
||||||
|
#define F_0_500 32768 /* FIX(0.50000) */
|
||||||
|
#define F_0_587 38470 /* FIX(0.58700) */
|
||||||
|
#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
|
||||||
|
|
||||||
|
#define SCALEBITS 16
|
||||||
|
#define ONE_HALF (1 << (SCALEBITS - 1))
|
||||||
|
|
||||||
|
|
||||||
|
#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
|
||||||
|
#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
|
||||||
|
#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
|
||||||
|
#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||||
|
#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX0
|
||||||
|
#undef RGBG_INDEX1
|
||||||
|
#undef RGBG_INDEX2
|
||||||
|
#undef RGBG_INDEX3
|
||||||
|
#undef jsimd_rgb_ycc_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
|
||||||
|
#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_ycc_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||||
|
#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
|
||||||
|
#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
|
||||||
|
#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
|
||||||
|
#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
|
||||||
|
#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX0
|
||||||
|
#undef RGBG_INDEX1
|
||||||
|
#undef RGBG_INDEX2
|
||||||
|
#undef RGBG_INDEX3
|
||||||
|
#undef jsimd_rgb_ycc_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
|
||||||
|
#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_ycc_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
|
||||||
|
#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_ycc_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
|
||||||
|
#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
|
||||||
|
#include "jccolext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_ycc_convert_altivec
|
||||||
99
simd/jcgray-altivec.c
Normal file
99
simd/jcgray-altivec.c
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* RGB --> GRAYSCALE CONVERSION */
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define F_0_114 7471 /* FIX(0.11400) */
|
||||||
|
#define F_0_250 16384 /* FIX(0.25000) */
|
||||||
|
#define F_0_299 19595 /* FIX(0.29900) */
|
||||||
|
#define F_0_587 38470 /* FIX(0.58700) */
|
||||||
|
#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
|
||||||
|
|
||||||
|
#define SCALEBITS 16
|
||||||
|
#define ONE_HALF (1 << (SCALEBITS - 1))
|
||||||
|
|
||||||
|
|
||||||
|
#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
|
||||||
|
#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
|
||||||
|
#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
|
||||||
|
#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||||
|
#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX0
|
||||||
|
#undef RGBG_INDEX1
|
||||||
|
#undef RGBG_INDEX2
|
||||||
|
#undef RGBG_INDEX3
|
||||||
|
#undef jsimd_rgb_gray_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
|
||||||
|
#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_gray_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||||
|
#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
|
||||||
|
#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
|
||||||
|
#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
|
||||||
|
#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
|
||||||
|
#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX0
|
||||||
|
#undef RGBG_INDEX1
|
||||||
|
#undef RGBG_INDEX2
|
||||||
|
#undef RGBG_INDEX3
|
||||||
|
#undef jsimd_rgb_gray_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
|
||||||
|
#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_gray_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
|
||||||
|
#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_gray_convert_altivec
|
||||||
|
|
||||||
|
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||||
|
#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
|
||||||
|
#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
|
||||||
|
#include "jcgryext-altivec.c"
|
||||||
|
#undef RGB_PIXELSIZE
|
||||||
|
#undef RGBG_INDEX
|
||||||
|
#undef jsimd_rgb_gray_convert_altivec
|
||||||
200
simd/jcgryext-altivec.c
Normal file
200
simd/jcgryext-altivec.c
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* Copyright (C) 2014, Jay Foad.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* This file is included by jcgray-altivec.c */
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Convert rows of packed RGB pixels to grayscale (Y) samples using AltiVec.
 *
 * This file is a template: jcgray-altivec.c includes it repeatedly with
 * RGB_PIXELSIZE, RGBG_INDEX* and the function name #defined per pixel format.
 *
 * img_width   row width in pixels (row size in bytes is
 *             img_width * RGB_PIXELSIZE)
 * input_buf   array of input row pointers; one pointer is consumed per row
 * output_buf  gray rows are written to output_buf[0][output_row],
 *             output_buf[0][output_row + 1], ...
 * output_row  index of the first output row to write
 * num_rows    number of rows to convert
 *
 * Each inner-loop pass converts 16 pixels and always stores a full 16-byte
 * vector with vec_st(), even when fewer than 16 pixels remain in the row.
 * NOTE(review): this presumably relies on output rows being 16-byte aligned
 * and padded to a multiple of 16 samples -- confirm against the allocator.
 */
void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
|
||||||
|
JSAMPARRAY input_buf,
|
||||||
|
JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows)
|
||||||
|
{
|
||||||
|
JSAMPROW inptr, outptr;
|
||||||
|
/* Bytes of input remaining in the current row (counts down by 16 pixels) */
int pitch;
|
||||||
|
__vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
|
||||||
|
rgbg1, rgbg2, rgbg3, y;
|
||||||
|
#if RGB_PIXELSIZE == 4
|
||||||
|
/* Extra block needed only to realign the 64-byte unaligned load; it is
 * assigned on every path that reads it.
 */
__vector unsigned char rgb4;
|
||||||
|
#endif
|
||||||
|
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
|
||||||
|
__vector unsigned short y01, y23;
|
||||||
|
__vector int y0, y1, y2, y3;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
|
||||||
|
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
|
||||||
|
__vector int pd_onehalf = { __4X(ONE_HALF) };
|
||||||
|
__vector unsigned char zero = { __16X(0) },
|
||||||
|
shift_pack_index =
|
||||||
|
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
|
||||||
|
|
||||||
|
while (--num_rows >= 0) {
|
||||||
|
inptr = *input_buf++;
|
||||||
|
outptr = output_buf[0][output_row];
|
||||||
|
output_row++;
|
||||||
|
|
||||||
|
for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
|
||||||
|
pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
|
||||||
|
outptr += 16) {
|
||||||
|
|
||||||
|
#if RGB_PIXELSIZE == 3
|
||||||
|
/* Load 16 pixels == 48 bytes */
|
||||||
|
/* Unaligned input: vec_ld() only loads 16-byte-aligned blocks, so load the
 * blocks that straddle the data and realign them with vec_lvsl()/vec_perm().
 * Near the end of the row the load address is taken from the last byte of
 * the row (inptr + pitch - 1), presumably so that no block lying entirely
 * beyond the row data is touched.
 */
if ((size_t)inptr & 15) {
|
||||||
|
__vector unsigned char unaligned_shift_index;
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
else
|
||||||
|
rgb1 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
else
|
||||||
|
rgb2 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 48)
|
||||||
|
rgb3 = vec_ld(48, inptr);
|
||||||
|
else
|
||||||
|
rgb3 = vec_ld(-1, inptr + pitch);
|
||||||
|
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||||
|
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||||
|
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||||
|
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||||
|
} else {
|
||||||
|
/* Aligned input: blocks that start at or past the end of the row are not
 * loaded, so rgb1/rgb2 keep their previous contents there; only lanes for
 * bytes beyond the row are affected.
 */
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
|
||||||
|
* rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
|
||||||
|
* rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
|
||||||
|
*
|
||||||
|
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||||
|
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||||
|
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||||
|
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||||
|
*/
|
||||||
|
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
|
||||||
|
rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
|
||||||
|
rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
|
||||||
|
rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
|
||||||
|
#else
|
||||||
|
/* Load 16 pixels == 64 bytes */
|
||||||
|
/* Same realignment scheme as the 3-byte case, with one more block (rgb4)
 * because 64 unaligned bytes span five aligned 16-byte blocks.
 */
if ((size_t)inptr & 15) {
|
||||||
|
__vector unsigned char unaligned_shift_index;
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
else
|
||||||
|
rgb1 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
else
|
||||||
|
rgb2 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 48)
|
||||||
|
rgb3 = vec_ld(48, inptr);
|
||||||
|
else
|
||||||
|
rgb3 = vec_ld(-1, inptr + pitch);
|
||||||
|
if (pitch > 64)
|
||||||
|
rgb4 = vec_ld(64, inptr);
|
||||||
|
else
|
||||||
|
rgb4 = vec_ld(-1, inptr + pitch);
|
||||||
|
unaligned_shift_index = vec_lvsl(0, inptr);
|
||||||
|
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
|
||||||
|
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
|
||||||
|
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
|
||||||
|
rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
|
||||||
|
} else {
|
||||||
|
rgb0 = vec_ld(0, inptr);
|
||||||
|
if (pitch > 16)
|
||||||
|
rgb1 = vec_ld(16, inptr);
|
||||||
|
if (pitch > 32)
|
||||||
|
rgb2 = vec_ld(32, inptr);
|
||||||
|
if (pitch > 48)
|
||||||
|
rgb3 = vec_ld(48, inptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
|
||||||
|
* rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
|
||||||
|
* rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
|
||||||
|
* rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
|
||||||
|
*
|
||||||
|
* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
|
||||||
|
* rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
|
||||||
|
* rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
|
||||||
|
* rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
|
||||||
|
*/
|
||||||
|
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
|
||||||
|
* bg0 = B0 G0 B1 G1 B2 G2 B3 G3
|
||||||
|
* ...
|
||||||
|
*
|
||||||
|
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
|
||||||
|
* support unsigned vectors.
|
||||||
|
*
 * Merging with the all-zero vector interleaves a zero byte before each
 * sample byte, i.e. it zero-extends every 8-bit sample to a 16-bit element.
 */
|
||||||
|
rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
|
||||||
|
bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
|
||||||
|
rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
|
||||||
|
bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
|
||||||
|
rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
|
||||||
|
bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
|
||||||
|
rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
|
||||||
|
bg3 = (__vector signed short)vec_mergel(zero, rgbg3);
|
||||||
|
|
||||||
|
/* (Original)
|
||||||
|
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||||
|
*
|
||||||
|
* (This implementation)
|
||||||
|
* Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||||
|
*
 * NOTE(review): the G coefficient is presumably split in two because
 * FIX(0.58700) = 38470 does not fit in a signed 16-bit vector element.
 */
|
||||||
|
|
||||||
|
/* Calculate Y values */
|
||||||
|
|
||||||
|
/* First multiply-sum: R * 0.299 + G * 0.337 + ONE_HALF (rounding term) */
y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
|
||||||
|
y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
|
||||||
|
/* Second multiply-sum accumulates B * 0.114 + G * 0.250 onto the above */
y0 = vec_msums(bg0, pw_f0114_f0250, y0);
|
||||||
|
y1 = vec_msums(bg1, pw_f0114_f0250, y1);
|
||||||
|
y2 = vec_msums(bg2, pw_f0114_f0250, y2);
|
||||||
|
y3 = vec_msums(bg3, pw_f0114_f0250, y3);
|
||||||
|
/* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
|
||||||
|
* each dword into a new 16-bit vector, which is the equivalent of
|
||||||
|
* descaling the 32-bit results (right-shifting by 16 bits) and then
|
||||||
|
* packing them.
|
||||||
|
*/
|
||||||
|
y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
|
||||||
|
shift_pack_index);
|
||||||
|
y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
|
||||||
|
shift_pack_index);
|
||||||
|
y = vec_pack(y01, y23);
|
||||||
|
/* Stores all 16 result bytes, including lanes past the end of the row */
vec_st(y, 0, outptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
156
simd/jfdctfst-altivec.c
Normal file
156
simd/jfdctfst-altivec.c
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* FAST INTEGER FORWARD DCT
|
||||||
|
*
|
||||||
|
* This is similar to the SSE2 implementation, except that we left-shift the
|
||||||
|
* constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
|
||||||
|
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
|
||||||
|
* the elements in arg3 + the most significant 17 bits of
|
||||||
|
* (the elements in arg1 * the elements in arg2).
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define F_0_382 98 /* FIX(0.382683433) */
|
||||||
|
#define F_0_541 139 /* FIX(0.541196100) */
|
||||||
|
#define F_0_707 181 /* FIX(0.707106781) */
|
||||||
|
#define F_1_306 334 /* FIX(1.306562965) */
|
||||||
|
|
||||||
|
#define CONST_BITS 8
|
||||||
|
#define PRE_MULTIPLY_SCALE_BITS 2
|
||||||
|
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
|
||||||
|
|
||||||
|
|
||||||
|
/* One 1-D pass of the fast integer forward DCT over eight vectors.
 *
 * The macro takes no arguments; it works on variables of the enclosing scope:
 *   inputs:    tmp0..tmp3 (sums feeding the even part) and tmp4..tmp7
 *              (differences feeding the odd part)
 *   outputs:   out0..out7
 *   scratch:   tmp10..tmp13, z1..z5, z11, z13 (all overwritten)
 *   constants: pw_0382, pw_0541, pw_0707, pw_1306, zero,
 *              pre_multiply_scale_bits
 *
 * Operands are left-shifted by pre_multiply_scale_bits before each
 * vec_madds() multiply; the scaling is explained in the "FAST INTEGER
 * FORWARD DCT" comment earlier in the file.
 */
#define DO_FDCT() \
|
||||||
|
{ \
|
||||||
|
/* Even part */ \
|
||||||
|
\
|
||||||
|
tmp10 = vec_add(tmp0, tmp3); \
|
||||||
|
tmp13 = vec_sub(tmp0, tmp3); \
|
||||||
|
tmp11 = vec_add(tmp1, tmp2); \
|
||||||
|
tmp12 = vec_sub(tmp1, tmp2); \
|
||||||
|
\
|
||||||
|
out0 = vec_add(tmp10, tmp11); \
|
||||||
|
out4 = vec_sub(tmp10, tmp11); \
|
||||||
|
\
|
||||||
|
z1 = vec_add(tmp12, tmp13); \
|
||||||
|
z1 = vec_sl(z1, pre_multiply_scale_bits); \
|
||||||
|
z1 = vec_madds(z1, pw_0707, zero); \
|
||||||
|
\
|
||||||
|
out2 = vec_add(tmp13, z1); \
|
||||||
|
out6 = vec_sub(tmp13, z1); \
|
||||||
|
\
|
||||||
|
/* Odd part */ \
|
||||||
|
\
|
||||||
|
tmp10 = vec_add(tmp4, tmp5); \
|
||||||
|
tmp11 = vec_add(tmp5, tmp6); \
|
||||||
|
tmp12 = vec_add(tmp6, tmp7); \
|
||||||
|
\
|
||||||
|
tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
|
||||||
|
tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
|
||||||
|
z5 = vec_sub(tmp10, tmp12); \
|
||||||
|
z5 = vec_madds(z5, pw_0382, zero); \
|
||||||
|
\
|
||||||
|
z2 = vec_madds(tmp10, pw_0541, z5); \
|
||||||
|
z4 = vec_madds(tmp12, pw_1306, z5); \
|
||||||
|
\
|
||||||
|
tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
|
||||||
|
z3 = vec_madds(tmp11, pw_0707, zero); \
|
||||||
|
\
|
||||||
|
z11 = vec_add(tmp7, z3); \
|
||||||
|
z13 = vec_sub(tmp7, z3); \
|
||||||
|
\
|
||||||
|
out5 = vec_add(z13, z2); \
|
||||||
|
out3 = vec_sub(z13, z2); \
|
||||||
|
out1 = vec_add(z11, z4); \
|
||||||
|
out7 = vec_sub(z11, z4); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Fast integer forward DCT of one 8x8 block, computed in place.
 *
 * data  block of coefficients, read as eight 16-byte vectors at byte offsets
 *       0, 16, ..., 112 and overwritten with the result at the same offsets.
 *       NOTE(review): vec_ld()/vec_st() address 16-byte-aligned blocks, so
 *       data presumably must be 16-byte aligned -- confirm with the callers.
 *
 * Two passes are made. Each one transposes the block, forms the sum and
 * difference butterflies into tmp0..tmp7, then runs DO_FDCT(), which leaves
 * its result in out0..out7. TRANSPOSE is not defined in this section
 * (presumably it comes from jsimd_altivec.h).
 */
void
|
||||||
|
jsimd_fdct_ifast_altivec (DCTELEM *data)
|
||||||
|
{
|
||||||
|
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||||
|
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||||
|
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||||
|
z1, z2, z3, z4, z5, z11, z13,
|
||||||
|
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
/* The multipliers are pre-shifted by CONST_SHIFT to suit vec_madds() */
__vector short zero = vec_splat_s16(0),
|
||||||
|
pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
|
||||||
|
pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
|
||||||
|
pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
|
||||||
|
pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
|
||||||
|
__vector unsigned short
|
||||||
|
pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
|
||||||
|
|
||||||
|
/* Pass 1: process rows */
|
||||||
|
|
||||||
|
row0 = vec_ld(0, data);
|
||||||
|
row1 = vec_ld(16, data);
|
||||||
|
row2 = vec_ld(32, data);
|
||||||
|
row3 = vec_ld(48, data);
|
||||||
|
row4 = vec_ld(64, data);
|
||||||
|
row5 = vec_ld(80, data);
|
||||||
|
row6 = vec_ld(96, data);
|
||||||
|
row7 = vec_ld(112, data);
|
||||||
|
|
||||||
|
TRANSPOSE(row, col);
|
||||||
|
|
||||||
|
/* Butterflies: tmp0..tmp3 are sums, tmp4..tmp7 differences of mirrored pairs */
tmp0 = vec_add(col0, col7);
|
||||||
|
tmp7 = vec_sub(col0, col7);
|
||||||
|
tmp1 = vec_add(col1, col6);
|
||||||
|
tmp6 = vec_sub(col1, col6);
|
||||||
|
tmp2 = vec_add(col2, col5);
|
||||||
|
tmp5 = vec_sub(col2, col5);
|
||||||
|
tmp3 = vec_add(col3, col4);
|
||||||
|
tmp4 = vec_sub(col3, col4);
|
||||||
|
|
||||||
|
DO_FDCT();
|
||||||
|
|
||||||
|
/* Pass 2: process columns */
|
||||||
|
|
||||||
|
/* The pass-1 results (out0..out7) are transposed back into row0..row7 */
TRANSPOSE(out, row);
|
||||||
|
|
||||||
|
tmp0 = vec_add(row0, row7);
|
||||||
|
tmp7 = vec_sub(row0, row7);
|
||||||
|
tmp1 = vec_add(row1, row6);
|
||||||
|
tmp6 = vec_sub(row1, row6);
|
||||||
|
tmp2 = vec_add(row2, row5);
|
||||||
|
tmp5 = vec_sub(row2, row5);
|
||||||
|
tmp3 = vec_add(row3, row4);
|
||||||
|
tmp4 = vec_sub(row3, row4);
|
||||||
|
|
||||||
|
DO_FDCT();
|
||||||
|
|
||||||
|
/* Write the final coefficients back over the input block */
vec_st(out0, 0, data);
|
||||||
|
vec_st(out1, 16, data);
|
||||||
|
vec_st(out2, 32, data);
|
||||||
|
vec_st(out3, 48, data);
|
||||||
|
vec_st(out4, 64, data);
|
||||||
|
vec_st(out5, 80, data);
|
||||||
|
vec_st(out6, 96, data);
|
||||||
|
vec_st(out7, 112, data);
|
||||||
|
}
|
||||||
262
simd/jfdctint-altivec.c
Normal file
262
simd/jfdctint-altivec.c
Normal file
@@ -0,0 +1,262 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* SLOW INTEGER FORWARD DCT */
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Fixed-point multipliers for the slow integer forward DCT.  Each value is
 * the real constant quoted beside it multiplied by 2^CONST_BITS (8192) and
 * rounded to the nearest integer.
 */
#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/* Number of fractional bits in the multipliers above. */
#define CONST_BITS  13
/* Pass 1 leaves out0/out4 shifted left by this many bits; pass 2 shifts them
 * back (see DO_FDCT_ROWS() and DO_FDCT_COLS()). */
#define PASS1_BITS  2
/* Right-shift counts applied to the 32-bit sums in pass 1 and pass 2. */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Part of the forward DCT that is identical in both passes.
 *
 * Reads tmp4-tmp7, tmp12 and tmp13 from the enclosing scope and writes out1,
 * out2, out3, out5, out6 and out7 (out0 and out4 are produced by
 * DO_FDCT_ROWS() / DO_FDCT_COLS() before this macro is invoked).  PASS must be
 * 1 or 2; it is token-pasted to select pd_descale_p<PASS> (the rounding term
 * fed to vec_msums()) and descale_p<PASS> (the right-shift count).
 */
#define DO_FDCT_COMMON(PASS) \
{ \
  /* (Original) \
   * z1 = (tmp12 + tmp13) * 0.541196100; \
   * data2 = z1 + tmp13 * 0.765366865; \
   * data6 = z1 + tmp12 * -1.847759065; \
   * \
   * (This implementation) \
   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
   */ \
  \
  tmp1312l = vec_mergeh(tmp13, tmp12); \
  tmp1312h = vec_mergel(tmp13, tmp12); \
  \
  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out6 = vec_pack(out6l, out6h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(tmp4, tmp6); \
  z4 = vec_add(tmp5, tmp7); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  /* The rounding term is added here once; it reaches every odd output \
   * because z3l/z3h/z4l/z4h are the accumulator inputs below. \
   */ \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
  \
  /* (Original) \
   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
   * \
   * (This implementation) \
   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
   */ \
  \
  tmp47l = vec_mergeh(tmp4, tmp7); \
  tmp47h = vec_mergel(tmp4, tmp7); \
  \
  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
  \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  \
  out7 = vec_pack(out7l, out7h); \
  out1 = vec_pack(out1l, out1h); \
  \
  tmp56l = vec_mergeh(tmp5, tmp6); \
  tmp56h = vec_mergel(tmp5, tmp6); \
  \
  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
  \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  \
  out5 = vec_pack(out5l, out5h); \
  out3 = vec_pack(out3l, out3h); \
}
|
||||||
|
|
||||||
|
/*
 * Pass 1 of the forward DCT.
 *
 * Reads tmp0-tmp3 from the enclosing scope, sets tmp10-tmp13, and writes out0
 * and out4 shifted left by pass1_bits.  The remaining outputs come from
 * DO_FDCT_COMMON(1).
 */
#define DO_FDCT_ROWS() \
{ \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_sl(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_sl(out4, pass1_bits); \
  \
  DO_FDCT_COMMON(1); \
}
|
||||||
|
|
||||||
|
/*
 * Pass 2 of the forward DCT.
 *
 * Reads tmp0-tmp3 from the enclosing scope, sets tmp10-tmp13, and writes out0
 * and out4 after adding the rounding term pw_descale_p2x and shifting right by
 * pass1_bits.  The remaining outputs come from DO_FDCT_COMMON(2).
 */
#define DO_FDCT_COLS() \
{ \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_add(out0, pw_descale_p2x); \
  out0 = vec_sra(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_add(out4, pw_descale_p2x); \
  out4 = vec_sra(out4, pass1_bits); \
  \
  DO_FDCT_COMMON(2); \
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
jsimd_fdct_islow_altivec (DCTELEM *data)
|
||||||
|
{
|
||||||
|
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||||
|
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||||
|
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||||
|
tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
|
||||||
|
z3, z4, z34l, z34h,
|
||||||
|
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||||
|
__vector int z3l, z3h, z4l, z4h,
|
||||||
|
out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
|
||||||
|
out7l, out7h;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector short
|
||||||
|
pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
|
||||||
|
pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
|
||||||
|
pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
|
||||||
|
pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
|
||||||
|
pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
|
||||||
|
pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
|
||||||
|
pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
|
||||||
|
pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
|
||||||
|
pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
|
||||||
|
__vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
|
||||||
|
__vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
|
||||||
|
pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
|
||||||
|
__vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
|
||||||
|
descale_p2 = { __4X(DESCALE_P2) };
|
||||||
|
|
||||||
|
/* Pass 1: process rows */
|
||||||
|
|
||||||
|
row0 = vec_ld(0, data);
|
||||||
|
row1 = vec_ld(16, data);
|
||||||
|
row2 = vec_ld(32, data);
|
||||||
|
row3 = vec_ld(48, data);
|
||||||
|
row4 = vec_ld(64, data);
|
||||||
|
row5 = vec_ld(80, data);
|
||||||
|
row6 = vec_ld(96, data);
|
||||||
|
row7 = vec_ld(112, data);
|
||||||
|
|
||||||
|
TRANSPOSE(row, col);
|
||||||
|
|
||||||
|
tmp0 = vec_add(col0, col7);
|
||||||
|
tmp7 = vec_sub(col0, col7);
|
||||||
|
tmp1 = vec_add(col1, col6);
|
||||||
|
tmp6 = vec_sub(col1, col6);
|
||||||
|
tmp2 = vec_add(col2, col5);
|
||||||
|
tmp5 = vec_sub(col2, col5);
|
||||||
|
tmp3 = vec_add(col3, col4);
|
||||||
|
tmp4 = vec_sub(col3, col4);
|
||||||
|
|
||||||
|
DO_FDCT_ROWS();
|
||||||
|
|
||||||
|
/* Pass 2: process columns */
|
||||||
|
|
||||||
|
TRANSPOSE(out, row);
|
||||||
|
|
||||||
|
tmp0 = vec_add(row0, row7);
|
||||||
|
tmp7 = vec_sub(row0, row7);
|
||||||
|
tmp1 = vec_add(row1, row6);
|
||||||
|
tmp6 = vec_sub(row1, row6);
|
||||||
|
tmp2 = vec_add(row2, row5);
|
||||||
|
tmp5 = vec_sub(row2, row5);
|
||||||
|
tmp3 = vec_add(row3, row4);
|
||||||
|
tmp4 = vec_sub(row3, row4);
|
||||||
|
|
||||||
|
DO_FDCT_COLS();
|
||||||
|
|
||||||
|
vec_st(out0, 0, data);
|
||||||
|
vec_st(out1, 16, data);
|
||||||
|
vec_st(out2, 32, data);
|
||||||
|
vec_st(out3, 48, data);
|
||||||
|
vec_st(out4, 64, data);
|
||||||
|
vec_st(out5, 80, data);
|
||||||
|
vec_st(out6, 96, data);
|
||||||
|
vec_st(out7, 112, data);
|
||||||
|
}
|
||||||
256
simd/jidctfst-altivec.c
Normal file
256
simd/jidctfst-altivec.c
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* FAST INTEGER INVERSE DCT
|
||||||
|
*
|
||||||
|
* This is similar to the SSE2 implementation, except that we left-shift the
|
||||||
|
* constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
|
||||||
|
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
|
||||||
|
* the elements in arg3 + the most significant 17 bits of
|
||||||
|
* (the elements in arg1 * the elements in arg2).
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Fixed-point multipliers for the fast integer inverse DCT.  Each value is the
 * real constant quoted beside it multiplied by 2^CONST_BITS (256) and rounded
 * to the nearest integer.
 */
#define F_1_082  277              /* FIX(1.082392200) */
#define F_1_414  362              /* FIX(1.414213562) */
#define F_1_847  473              /* FIX(1.847759065) */
#define F_2_613  669              /* FIX(2.613125930) */
#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

/* Number of fractional bits in the multipliers above. */
#define CONST_BITS  8
#define PASS1_BITS  2
/* Left shift applied to an operand before it is passed to vec_madds(). */
#define PRE_MULTIPLY_SCALE_BITS  2
/* Left shift applied to the multipliers when the vector constants are built.
 * The trailing -1 accounts for vec_madds() keeping the most significant 17
 * bits of each 32-bit product.
 */
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * One pass of the fast integer inverse DCT.
 *
 * in is a variable-name prefix: the macro reads in##0 ... in##7 and writes
 * out0 ... out7.  It uses tmp0-tmp7, tmp10-tmp13, z5, z10, z10s, z11, z12s and
 * z13 from the enclosing scope as scratch, together with the constants zero,
 * pre_multiply_scale_bits, pw_F1414, pw_F1847, pw_F1082 and pw_MF1613.
 * Operands are shifted left by pre_multiply_scale_bits before each
 * vec_madds() multiplication.
 */
#define DO_IDCT(in) \
{ \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part (tmp10-tmp12 are reused from here on) */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, zero); \
  \
  tmp10 = vec_madds(z12s, pw_F1082, zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
|
||||||
|
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||||
|
{
|
||||||
|
short *dct_table = (short *)dct_table_;
|
||||||
|
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||||
|
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||||
|
quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
|
||||||
|
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
||||||
|
z5, z10, z10s, z11, z12s, z13,
|
||||||
|
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||||
|
__vector signed char outb;
|
||||||
|
int *outptr;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector short zero = { __8X(0) },
|
||||||
|
pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
|
||||||
|
pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
|
||||||
|
pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
|
||||||
|
pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
|
||||||
|
__vector unsigned short
|
||||||
|
pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
|
||||||
|
pass1_bits3 = { __8X(PASS1_BITS + 3) };
|
||||||
|
__vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
|
||||||
|
|
||||||
|
/* Pass 1: process columns */
|
||||||
|
|
||||||
|
col0 = vec_ld(0, coef_block);
|
||||||
|
col1 = vec_ld(16, coef_block);
|
||||||
|
col2 = vec_ld(32, coef_block);
|
||||||
|
col3 = vec_ld(48, coef_block);
|
||||||
|
col4 = vec_ld(64, coef_block);
|
||||||
|
col5 = vec_ld(80, coef_block);
|
||||||
|
col6 = vec_ld(96, coef_block);
|
||||||
|
col7 = vec_ld(112, coef_block);
|
||||||
|
|
||||||
|
tmp1 = vec_or(col1, col2);
|
||||||
|
tmp2 = vec_or(col3, col4);
|
||||||
|
tmp1 = vec_or(tmp1, tmp2);
|
||||||
|
tmp3 = vec_or(col5, col6);
|
||||||
|
tmp3 = vec_or(tmp3, col7);
|
||||||
|
tmp1 = vec_or(tmp1, tmp3);
|
||||||
|
|
||||||
|
quant0 = *(__vector short *)&dct_table[0];
|
||||||
|
col0 = vec_mladd(col0, quant0, zero);
|
||||||
|
|
||||||
|
if (vec_all_eq(tmp1, zero)) {
|
||||||
|
/* AC terms all zero */
|
||||||
|
|
||||||
|
row0 = vec_splat(col0, 0);
|
||||||
|
row1 = vec_splat(col0, 1);
|
||||||
|
row2 = vec_splat(col0, 2);
|
||||||
|
row3 = vec_splat(col0, 3);
|
||||||
|
row4 = vec_splat(col0, 4);
|
||||||
|
row5 = vec_splat(col0, 5);
|
||||||
|
row6 = vec_splat(col0, 6);
|
||||||
|
row7 = vec_splat(col0, 7);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
quant1 = *(__vector short *)&dct_table[8];
|
||||||
|
quant2 = *(__vector short *)&dct_table[16];
|
||||||
|
quant3 = *(__vector short *)&dct_table[24];
|
||||||
|
quant4 = *(__vector short *)&dct_table[32];
|
||||||
|
quant5 = *(__vector short *)&dct_table[40];
|
||||||
|
quant6 = *(__vector short *)&dct_table[48];
|
||||||
|
quant7 = *(__vector short *)&dct_table[56];
|
||||||
|
|
||||||
|
col1 = vec_mladd(col1, quant1, zero);
|
||||||
|
col2 = vec_mladd(col2, quant2, zero);
|
||||||
|
col3 = vec_mladd(col3, quant3, zero);
|
||||||
|
col4 = vec_mladd(col4, quant4, zero);
|
||||||
|
col5 = vec_mladd(col5, quant5, zero);
|
||||||
|
col6 = vec_mladd(col6, quant6, zero);
|
||||||
|
col7 = vec_mladd(col7, quant7, zero);
|
||||||
|
|
||||||
|
DO_IDCT(col);
|
||||||
|
|
||||||
|
TRANSPOSE(out, row);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Pass 2: process rows */
|
||||||
|
|
||||||
|
DO_IDCT(row);
|
||||||
|
|
||||||
|
out0 = vec_sra(out0, pass1_bits3);
|
||||||
|
out1 = vec_sra(out1, pass1_bits3);
|
||||||
|
out2 = vec_sra(out2, pass1_bits3);
|
||||||
|
out3 = vec_sra(out3, pass1_bits3);
|
||||||
|
out4 = vec_sra(out4, pass1_bits3);
|
||||||
|
out5 = vec_sra(out5, pass1_bits3);
|
||||||
|
out6 = vec_sra(out6, pass1_bits3);
|
||||||
|
out7 = vec_sra(out7, pass1_bits3);
|
||||||
|
|
||||||
|
TRANSPOSE(out, col);
|
||||||
|
|
||||||
|
outb = vec_packs(col0, col0);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[0] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col1, col1);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[1] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col2, col2);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[2] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col3, col3);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[3] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col4, col4);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[4] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col5, col5);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[5] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col6, col6);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[6] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col7, col7);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[7] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
}
|
||||||
358
simd/jidctint-altivec.c
Normal file
358
simd/jidctint-altivec.c
Normal file
@@ -0,0 +1,358 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* SLOW INTEGER INVERSE DCT */
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Fixed-point multipliers for the slow integer inverse DCT.  Each value is
 * the real constant quoted beside it multiplied by 2^CONST_BITS (8192) and
 * rounded to the nearest integer.
 */
#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/* Number of fractional bits in the multipliers above. */
#define CONST_BITS  13
#define PASS1_BITS  2
/* Right-shift counts applied to the 32-bit sums in pass 1 and pass 2. */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * One pass of the slow integer inverse DCT.
 *
 * in is a variable-name prefix: the macro reads in##0 ... in##7, uses
 * in##26l/h, in##71l/h and in##53l/h as scratch for the interleaved operands,
 * and writes out0 ... out7.  PASS must be 1 or 2; it is token-pasted to select
 * pd_descale_p<PASS> (rounding term, added once in the even part) and
 * descale_p<PASS> (right-shift count applied before packing to 16 bits).
 */
#define DO_IDCT(in, PASS) \
{ \
  /* Even part \
   * \
   * (Original) \
   * z1 = (z2 + z3) * 0.541196100; \
   * tmp2 = z1 + z3 * -1.847759065; \
   * tmp3 = z1 + z2 * 0.765366865; \
   * \
   * (This implementation) \
   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
   */ \
  \
  in##26l = vec_mergeh(in##2, in##6); \
  in##26h = vec_mergel(in##2, in##6); \
  \
  tmp3l = vec_msums(in##26l, pw_f130_f054, zero32); \
  tmp3h = vec_msums(in##26h, pw_f130_f054, zero32); \
  tmp2l = vec_msums(in##26l, pw_f054_mf130, zero32); \
  tmp2h = vec_msums(in##26h, pw_f054_mf130, zero32); \
  \
  tmp0 = vec_add(in##0, in##4); \
  tmp1 = vec_sub(in##0, in##4); \
  \
  /* Widen to 32 bits, scale by 2^CONST_BITS and add the rounding term */ \
  tmp0l = vec_unpackh(tmp0); \
  tmp0h = vec_unpackl(tmp0); \
  tmp0l = vec_sl(tmp0l, const_bits); \
  tmp0h = vec_sl(tmp0h, const_bits); \
  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
  \
  tmp10l = vec_add(tmp0l, tmp3l); \
  tmp10h = vec_add(tmp0h, tmp3h); \
  tmp13l = vec_sub(tmp0l, tmp3l); \
  tmp13h = vec_sub(tmp0h, tmp3h); \
  \
  tmp1l = vec_unpackh(tmp1); \
  tmp1h = vec_unpackl(tmp1); \
  tmp1l = vec_sl(tmp1l, const_bits); \
  tmp1h = vec_sl(tmp1h, const_bits); \
  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
  \
  tmp11l = vec_add(tmp1l, tmp2l); \
  tmp11h = vec_add(tmp1h, tmp2h); \
  tmp12l = vec_sub(tmp1l, tmp2l); \
  tmp12h = vec_sub(tmp1h, tmp2h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(in##3, in##7); \
  z4 = vec_add(in##1, in##5); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, zero32); \
  z3h = vec_msums(z34h, pw_mf078_f117, zero32); \
  z4l = vec_msums(z34l, pw_f117_f078, zero32); \
  z4h = vec_msums(z34h, pw_f117_f078, zero32); \
  \
  /* (Original) \
   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
   * \
   * (This implementation) \
   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
   * tmp0 += z3;  tmp1 += z4; \
   * tmp2 += z3;  tmp3 += z4; \
   */ \
  \
  in##71l = vec_mergeh(in##7, in##1); \
  in##71h = vec_mergel(in##7, in##1); \
  \
  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
  \
  in##53l = vec_mergeh(in##5, in##3); \
  in##53h = vec_mergel(in##5, in##3); \
  \
  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
  \
  /* Final output stage */ \
  \
  out0l = vec_add(tmp10l, tmp3l); \
  out0h = vec_add(tmp10h, tmp3h); \
  out7l = vec_sub(tmp10l, tmp3l); \
  out7h = vec_sub(tmp10h, tmp3h); \
  \
  out0l = vec_sra(out0l, descale_p##PASS); \
  out0h = vec_sra(out0h, descale_p##PASS); \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  \
  out0 = vec_pack(out0l, out0h); \
  out7 = vec_pack(out7l, out7h); \
  \
  out1l = vec_add(tmp11l, tmp2l); \
  out1h = vec_add(tmp11h, tmp2h); \
  out6l = vec_sub(tmp11l, tmp2l); \
  out6h = vec_sub(tmp11h, tmp2h); \
  \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out1 = vec_pack(out1l, out1h); \
  out6 = vec_pack(out6l, out6h); \
  \
  out2l = vec_add(tmp12l, tmp1l); \
  out2h = vec_add(tmp12h, tmp1h); \
  out5l = vec_sub(tmp12l, tmp1l); \
  out5h = vec_sub(tmp12h, tmp1h); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out5 = vec_pack(out5l, out5h); \
  \
  out3l = vec_add(tmp13l, tmp0l); \
  out3h = vec_add(tmp13h, tmp0h); \
  out4l = vec_sub(tmp13l, tmp0l); \
  out4h = vec_sub(tmp13h, tmp0h); \
  \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  out4l = vec_sra(out4l, descale_p##PASS); \
  out4h = vec_sra(out4h, descale_p##PASS); \
  \
  out3 = vec_pack(out3l, out3h); \
  out4 = vec_pack(out4l, out4h); \
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
|
||||||
|
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||||
|
{
|
||||||
|
short *dct_table = (short *)dct_table_;
|
||||||
|
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
||||||
|
col0, col1, col2, col3, col4, col5, col6, col7,
|
||||||
|
quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
|
||||||
|
tmp0, tmp1, tmp2, tmp3, z3, z4,
|
||||||
|
z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
|
||||||
|
row71l, row71h, row26l, row26h, row53l, row53h,
|
||||||
|
out0, out1, out2, out3, out4, out5, out6, out7;
|
||||||
|
__vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
|
||||||
|
tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
|
||||||
|
z3l, z3h, z4l, z4h,
|
||||||
|
out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
|
||||||
|
out5l, out5h, out6l, out6h, out7l, out7h;
|
||||||
|
__vector signed char outb;
|
||||||
|
int *outptr;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector short zero16 = { __8X(0) },
|
||||||
|
pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
|
||||||
|
pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
|
||||||
|
pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
|
||||||
|
pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
|
||||||
|
pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
|
||||||
|
pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
|
||||||
|
pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
|
||||||
|
pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
|
||||||
|
__vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
|
||||||
|
__vector int zero32 = { __4X(0) },
|
||||||
|
pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
|
||||||
|
pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
|
||||||
|
__vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
|
||||||
|
descale_p2 = { __4X(DESCALE_P2) },
|
||||||
|
const_bits = { __4X(CONST_BITS) };
|
||||||
|
__vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
|
||||||
|
|
||||||
|
/* Pass 1: process columns */
|
||||||
|
|
||||||
|
col0 = *(__vector short *)&coef_block[0];
|
||||||
|
col1 = *(__vector short *)&coef_block[8];
|
||||||
|
col2 = *(__vector short *)&coef_block[16];
|
||||||
|
col3 = *(__vector short *)&coef_block[24];
|
||||||
|
col4 = *(__vector short *)&coef_block[32];
|
||||||
|
col5 = *(__vector short *)&coef_block[40];
|
||||||
|
col6 = *(__vector short *)&coef_block[48];
|
||||||
|
col7 = *(__vector short *)&coef_block[56];
|
||||||
|
|
||||||
|
tmp1 = vec_or(col1, col2);
|
||||||
|
tmp2 = vec_or(col3, col4);
|
||||||
|
tmp1 = vec_or(tmp1, tmp2);
|
||||||
|
tmp3 = vec_or(col5, col6);
|
||||||
|
tmp3 = vec_or(tmp3, col7);
|
||||||
|
tmp1 = vec_or(tmp1, tmp3);
|
||||||
|
|
||||||
|
quant0 = *(__vector short *)&dct_table[0];
|
||||||
|
col0 = vec_mladd(col0, quant0, zero16);
|
||||||
|
|
||||||
|
if (vec_all_eq(tmp1, zero16)) {
|
||||||
|
/* AC terms all zero */
|
||||||
|
|
||||||
|
col0 = vec_sl(col0, pass1_bits);
|
||||||
|
|
||||||
|
row0 = vec_splat(col0, 0);
|
||||||
|
row1 = vec_splat(col0, 1);
|
||||||
|
row2 = vec_splat(col0, 2);
|
||||||
|
row3 = vec_splat(col0, 3);
|
||||||
|
row4 = vec_splat(col0, 4);
|
||||||
|
row5 = vec_splat(col0, 5);
|
||||||
|
row6 = vec_splat(col0, 6);
|
||||||
|
row7 = vec_splat(col0, 7);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
quant1 = *(__vector short *)&dct_table[8];
|
||||||
|
quant2 = *(__vector short *)&dct_table[16];
|
||||||
|
quant3 = *(__vector short *)&dct_table[24];
|
||||||
|
quant4 = *(__vector short *)&dct_table[32];
|
||||||
|
quant5 = *(__vector short *)&dct_table[40];
|
||||||
|
quant6 = *(__vector short *)&dct_table[48];
|
||||||
|
quant7 = *(__vector short *)&dct_table[56];
|
||||||
|
|
||||||
|
col1 = vec_mladd(col1, quant1, zero16);
|
||||||
|
col2 = vec_mladd(col2, quant2, zero16);
|
||||||
|
col3 = vec_mladd(col3, quant3, zero16);
|
||||||
|
col4 = vec_mladd(col4, quant4, zero16);
|
||||||
|
col5 = vec_mladd(col5, quant5, zero16);
|
||||||
|
col6 = vec_mladd(col6, quant6, zero16);
|
||||||
|
col7 = vec_mladd(col7, quant7, zero16);
|
||||||
|
|
||||||
|
DO_IDCT(col, 1);
|
||||||
|
|
||||||
|
TRANSPOSE(out, row);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Pass 2: process rows */
|
||||||
|
|
||||||
|
DO_IDCT(row, 2);
|
||||||
|
|
||||||
|
TRANSPOSE(out, col);
|
||||||
|
|
||||||
|
outb = vec_packs(col0, col0);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[0] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col1, col1);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[1] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col2, col2);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[2] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col3, col3);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[3] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col4, col4);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[4] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col5, col5);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[5] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col6, col6);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[6] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
|
||||||
|
outb = vec_packs(col7, col7);
|
||||||
|
outb = vec_add(outb, pb_centerjsamp);
|
||||||
|
outptr = (int *)(output_buf[7] + output_col);
|
||||||
|
vec_ste((__vector int)outb, 0, outptr);
|
||||||
|
vec_ste((__vector int)outb, 4, outptr);
|
||||||
|
}
|
||||||
236
simd/jquanti-altivec.c
Normal file
236
simd/jquanti-altivec.c
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
/*
|
||||||
|
* AltiVec optimizations for libjpeg-turbo
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014, D. R. Commander.
|
||||||
|
* All rights reserved.
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute it
|
||||||
|
* freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
* claim that you wrote the original software. If you use this software
|
||||||
|
* in a product, an acknowledgment in the product documentation would be
|
||||||
|
* appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
* misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
|
||||||
|
|
||||||
|
#include "jsimd_altivec.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
|
||||||
|
* always get the data we want by using a single vector load (although we may
|
||||||
|
* have to permute the result.)
|
||||||
|
*/
|
||||||
|
#define LOAD_ROW(row) { \
|
||||||
|
elemptr = sample_data[row] + start_col; \
|
||||||
|
in##row = vec_ld(0, elemptr); \
|
||||||
|
if ((size_t)elemptr & 15) \
|
||||||
|
in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||||
|
DCTELEM * workspace)
|
||||||
|
{
|
||||||
|
JSAMPROW elemptr;
|
||||||
|
__vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
|
||||||
|
__vector short out0, out1, out2, out3, out4, out5, out6, out7;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
|
||||||
|
__vector unsigned char zero = { __16X(0) };
|
||||||
|
|
||||||
|
LOAD_ROW(0);
|
||||||
|
LOAD_ROW(1);
|
||||||
|
LOAD_ROW(2);
|
||||||
|
LOAD_ROW(3);
|
||||||
|
LOAD_ROW(4);
|
||||||
|
LOAD_ROW(5);
|
||||||
|
LOAD_ROW(6);
|
||||||
|
LOAD_ROW(7);
|
||||||
|
|
||||||
|
out0 = (__vector short)vec_mergeh(zero, in0);
|
||||||
|
out1 = (__vector short)vec_mergeh(zero, in1);
|
||||||
|
out2 = (__vector short)vec_mergeh(zero, in2);
|
||||||
|
out3 = (__vector short)vec_mergeh(zero, in3);
|
||||||
|
out4 = (__vector short)vec_mergeh(zero, in4);
|
||||||
|
out5 = (__vector short)vec_mergeh(zero, in5);
|
||||||
|
out6 = (__vector short)vec_mergeh(zero, in6);
|
||||||
|
out7 = (__vector short)vec_mergeh(zero, in7);
|
||||||
|
|
||||||
|
out0 = vec_sub(out0, pw_centerjsamp);
|
||||||
|
out1 = vec_sub(out1, pw_centerjsamp);
|
||||||
|
out2 = vec_sub(out2, pw_centerjsamp);
|
||||||
|
out3 = vec_sub(out3, pw_centerjsamp);
|
||||||
|
out4 = vec_sub(out4, pw_centerjsamp);
|
||||||
|
out5 = vec_sub(out5, pw_centerjsamp);
|
||||||
|
out6 = vec_sub(out6, pw_centerjsamp);
|
||||||
|
out7 = vec_sub(out7, pw_centerjsamp);
|
||||||
|
|
||||||
|
vec_st(out0, 0, workspace);
|
||||||
|
vec_st(out1, 16, workspace);
|
||||||
|
vec_st(out2, 32, workspace);
|
||||||
|
vec_st(out3, 48, workspace);
|
||||||
|
vec_st(out4, 64, workspace);
|
||||||
|
vec_st(out5, 80, workspace);
|
||||||
|
vec_st(out6, 96, workspace);
|
||||||
|
vec_st(out7, 112, workspace);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define WORD_BIT 16
|
||||||
|
|
||||||
|
/* There is no AltiVec unsigned multiply instruction, hence this. */
|
||||||
|
|
||||||
|
#define MULTIPLY(vs0, vs1, out) { \
|
||||||
|
tmpe = vec_mule((__vector unsigned short)vs0, \
|
||||||
|
(__vector unsigned short)vs1); \
|
||||||
|
tmpo = vec_mulo((__vector unsigned short)vs0, \
|
||||||
|
(__vector unsigned short)vs1); \
|
||||||
|
out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
|
||||||
|
(__vector unsigned short)tmpo, \
|
||||||
|
shift_pack_index); \
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||||
|
DCTELEM * workspace)
|
||||||
|
{
|
||||||
|
__vector short row0, row1, row2, row3, row4, row5, row6, row7;
|
||||||
|
__vector short row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s;
|
||||||
|
__vector short corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7;
|
||||||
|
__vector short recip0, recip1, recip2, recip3, recip4, recip5, recip6,
|
||||||
|
recip7;
|
||||||
|
__vector short scale0, scale1, scale2, scale3, scale4, scale5, scale6,
|
||||||
|
scale7;
|
||||||
|
__vector unsigned int tmpe, tmpo;
|
||||||
|
|
||||||
|
/* Constants */
|
||||||
|
__vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
|
||||||
|
__vector unsigned char shift_pack_index =
|
||||||
|
{ 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
|
||||||
|
|
||||||
|
row0 = vec_ld(0, workspace);
|
||||||
|
row1 = vec_ld(16, workspace);
|
||||||
|
row2 = vec_ld(32, workspace);
|
||||||
|
row3 = vec_ld(48, workspace);
|
||||||
|
row4 = vec_ld(64, workspace);
|
||||||
|
row5 = vec_ld(80, workspace);
|
||||||
|
row6 = vec_ld(96, workspace);
|
||||||
|
row7 = vec_ld(112, workspace);
|
||||||
|
|
||||||
|
/* Branch-less absolute value */
|
||||||
|
row0s = vec_sra(row0, pw_word_bit_m1);
|
||||||
|
row1s = vec_sra(row1, pw_word_bit_m1);
|
||||||
|
row2s = vec_sra(row2, pw_word_bit_m1);
|
||||||
|
row3s = vec_sra(row3, pw_word_bit_m1);
|
||||||
|
row4s = vec_sra(row4, pw_word_bit_m1);
|
||||||
|
row5s = vec_sra(row5, pw_word_bit_m1);
|
||||||
|
row6s = vec_sra(row6, pw_word_bit_m1);
|
||||||
|
row7s = vec_sra(row7, pw_word_bit_m1);
|
||||||
|
row0 = vec_xor(row0, row0s);
|
||||||
|
row1 = vec_xor(row1, row1s);
|
||||||
|
row2 = vec_xor(row2, row2s);
|
||||||
|
row3 = vec_xor(row3, row3s);
|
||||||
|
row4 = vec_xor(row4, row4s);
|
||||||
|
row5 = vec_xor(row5, row5s);
|
||||||
|
row6 = vec_xor(row6, row6s);
|
||||||
|
row7 = vec_xor(row7, row7s);
|
||||||
|
row0 = vec_sub(row0, row0s);
|
||||||
|
row1 = vec_sub(row1, row1s);
|
||||||
|
row2 = vec_sub(row2, row2s);
|
||||||
|
row3 = vec_sub(row3, row3s);
|
||||||
|
row4 = vec_sub(row4, row4s);
|
||||||
|
row5 = vec_sub(row5, row5s);
|
||||||
|
row6 = vec_sub(row6, row6s);
|
||||||
|
row7 = vec_sub(row7, row7s);
|
||||||
|
|
||||||
|
corr0 = vec_ld(DCTSIZE2 * 2, divisors);
|
||||||
|
corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
|
||||||
|
corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
|
||||||
|
corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
|
||||||
|
corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
|
||||||
|
corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
|
||||||
|
corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
|
||||||
|
corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
|
||||||
|
|
||||||
|
row0 = vec_add(row0, corr0);
|
||||||
|
row1 = vec_add(row1, corr1);
|
||||||
|
row2 = vec_add(row2, corr2);
|
||||||
|
row3 = vec_add(row3, corr3);
|
||||||
|
row4 = vec_add(row4, corr4);
|
||||||
|
row5 = vec_add(row5, corr5);
|
||||||
|
row6 = vec_add(row6, corr6);
|
||||||
|
row7 = vec_add(row7, corr7);
|
||||||
|
|
||||||
|
recip0 = vec_ld(0, divisors);
|
||||||
|
recip1 = vec_ld(16, divisors);
|
||||||
|
recip2 = vec_ld(32, divisors);
|
||||||
|
recip3 = vec_ld(48, divisors);
|
||||||
|
recip4 = vec_ld(64, divisors);
|
||||||
|
recip5 = vec_ld(80, divisors);
|
||||||
|
recip6 = vec_ld(96, divisors);
|
||||||
|
recip7 = vec_ld(112, divisors);
|
||||||
|
|
||||||
|
MULTIPLY(row0, recip0, row0);
|
||||||
|
MULTIPLY(row1, recip1, row1);
|
||||||
|
MULTIPLY(row2, recip2, row2);
|
||||||
|
MULTIPLY(row3, recip3, row3);
|
||||||
|
MULTIPLY(row4, recip4, row4);
|
||||||
|
MULTIPLY(row5, recip5, row5);
|
||||||
|
MULTIPLY(row6, recip6, row6);
|
||||||
|
MULTIPLY(row7, recip7, row7);
|
||||||
|
|
||||||
|
scale0 = vec_ld(DCTSIZE2 * 4, divisors);
|
||||||
|
scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
|
||||||
|
scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
|
||||||
|
scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
|
||||||
|
scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
|
||||||
|
scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
|
||||||
|
scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
|
||||||
|
scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
|
||||||
|
|
||||||
|
MULTIPLY(row0, scale0, row0);
|
||||||
|
MULTIPLY(row1, scale1, row1);
|
||||||
|
MULTIPLY(row2, scale2, row2);
|
||||||
|
MULTIPLY(row3, scale3, row3);
|
||||||
|
MULTIPLY(row4, scale4, row4);
|
||||||
|
MULTIPLY(row5, scale5, row5);
|
||||||
|
MULTIPLY(row6, scale6, row6);
|
||||||
|
MULTIPLY(row7, scale7, row7);
|
||||||
|
|
||||||
|
row0 = vec_xor(row0, row0s);
|
||||||
|
row1 = vec_xor(row1, row1s);
|
||||||
|
row2 = vec_xor(row2, row2s);
|
||||||
|
row3 = vec_xor(row3, row3s);
|
||||||
|
row4 = vec_xor(row4, row4s);
|
||||||
|
row5 = vec_xor(row5, row5s);
|
||||||
|
row6 = vec_xor(row6, row6s);
|
||||||
|
row7 = vec_xor(row7, row7s);
|
||||||
|
row0 = vec_sub(row0, row0s);
|
||||||
|
row1 = vec_sub(row1, row1s);
|
||||||
|
row2 = vec_sub(row2, row2s);
|
||||||
|
row3 = vec_sub(row3, row3s);
|
||||||
|
row4 = vec_sub(row4, row4s);
|
||||||
|
row5 = vec_sub(row5, row5s);
|
||||||
|
row6 = vec_sub(row6, row6s);
|
||||||
|
row7 = vec_sub(row7, row7s);
|
||||||
|
|
||||||
|
vec_st(row0, 0, coef_block);
|
||||||
|
vec_st(row1, 16, coef_block);
|
||||||
|
vec_st(row2, 32, coef_block);
|
||||||
|
vec_st(row3, 48, coef_block);
|
||||||
|
vec_st(row4, 64, coef_block);
|
||||||
|
vec_st(row5, 80, coef_block);
|
||||||
|
vec_st(row6, 96, coef_block);
|
||||||
|
vec_st(row7, 112, coef_block);
|
||||||
|
}
|
||||||
60
simd/jsimd.h
60
simd/jsimd.h
@@ -116,6 +116,28 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
|
|||||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
JDIMENSION output_row, int num_rows);
|
JDIMENSION output_row, int num_rows);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_rgb_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extrgb_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extbgr_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
|
||||||
/* RGB & extended RGB --> Grayscale Colorspace Conversion */
|
/* RGB & extended RGB --> Grayscale Colorspace Conversion */
|
||||||
EXTERN(void) jsimd_rgb_gray_convert_mmx
|
EXTERN(void) jsimd_rgb_gray_convert_mmx
|
||||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
@@ -184,6 +206,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
|
|||||||
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
JDIMENSION output_row, int num_rows);
|
JDIMENSION output_row, int num_rows);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_rgb_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extrgb_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extrgbx_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extbgr_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extbgrx_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extxbgr_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
EXTERN(void) jsimd_extxrgb_gray_convert_altivec
|
||||||
|
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
|
JDIMENSION output_row, int num_rows);
|
||||||
|
|
||||||
/* YCC --> RGB & extended RGB Colorspace Conversion */
|
/* YCC --> RGB & extended RGB Colorspace Conversion */
|
||||||
EXTERN(void) jsimd_ycc_rgb_convert_mmx
|
EXTERN(void) jsimd_ycc_rgb_convert_mmx
|
||||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||||
@@ -524,6 +568,9 @@ EXTERN(void) jsimd_convsamp_neon
|
|||||||
EXTERN(void) jsimd_convsamp_mips_dspr2
|
EXTERN(void) jsimd_convsamp_mips_dspr2
|
||||||
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
|
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_convsamp_altivec
|
||||||
|
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
|
||||||
|
|
||||||
/* Floating Point Sample Conversion */
|
/* Floating Point Sample Conversion */
|
||||||
EXTERN(void) jsimd_convsamp_float_3dnow
|
EXTERN(void) jsimd_convsamp_float_3dnow
|
||||||
(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
|
(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
|
||||||
@@ -545,6 +592,8 @@ EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
|
|||||||
|
|
||||||
EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
|
EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
|
||||||
|
|
||||||
/* Fast Integer Forward DCT */
|
/* Fast Integer Forward DCT */
|
||||||
EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
|
EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
|
||||||
|
|
||||||
@@ -576,6 +625,9 @@ EXTERN(void) jsimd_quantize_neon
|
|||||||
EXTERN(void) jsimd_quantize_mips_dspr2
|
EXTERN(void) jsimd_quantize_mips_dspr2
|
||||||
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
|
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_quantize_altivec
|
||||||
|
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
|
||||||
|
|
||||||
/* Floating Point Quantization */
|
/* Floating Point Quantization */
|
||||||
EXTERN(void) jsimd_quantize_float_3dnow
|
EXTERN(void) jsimd_quantize_float_3dnow
|
||||||
(JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
|
(JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
|
||||||
@@ -644,6 +696,10 @@ EXTERN(void) jsimd_idct_islow_mips_dspr2
|
|||||||
(void * dct_table, JCOEFPTR coef_block, int * output_buf,
|
(void * dct_table, JCOEFPTR coef_block, int * output_buf,
|
||||||
JSAMPLE * output_col);
|
JSAMPLE * output_col);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_idct_islow_altivec
|
||||||
|
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||||
|
JDIMENSION output_col);
|
||||||
|
|
||||||
/* Fast Integer Inverse DCT */
|
/* Fast Integer Inverse DCT */
|
||||||
EXTERN(void) jsimd_idct_ifast_mmx
|
EXTERN(void) jsimd_idct_ifast_mmx
|
||||||
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||||
@@ -665,6 +721,10 @@ EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
|
|||||||
(DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
|
(DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
|
||||||
const int * idct_coefs);
|
const int * idct_coefs);
|
||||||
|
|
||||||
|
EXTERN(void) jsimd_idct_ifast_altivec
|
||||||
|
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||||
|
JDIMENSION output_col);
|
||||||
|
|
||||||
/* Floating Point Inverse DCT */
|
/* Floating Point Inverse DCT */
|
||||||
EXTERN(void) jsimd_idct_float_3dnow
|
EXTERN(void) jsimd_idct_float_3dnow
|
||||||
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||||
|
|||||||
@@ -29,6 +29,14 @@
|
|||||||
#include "jsimd.h"
|
#include "jsimd.h"
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
|
||||||
|
|
||||||
|
/* Common code */
|
||||||
|
|
||||||
|
#define __4X(a) a, a, a, a
|
||||||
|
#define __4X2(a, b) a, b, a, b, a, b, a, b
|
||||||
|
#define __8X(a) __4X(a), __4X(a)
|
||||||
|
#define __16X(a) __8X(a), __8X(a)
|
||||||
|
|
||||||
#define TRANSPOSE(row, col) \
|
#define TRANSPOSE(row, col) \
|
||||||
{ \
|
{ \
|
||||||
__vector short row04l, row04h, row15l, row15h, \
|
__vector short row04l, row04h, row15l, row15h, \
|
||||||
@@ -47,7 +55,7 @@
|
|||||||
row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
|
row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
|
||||||
\
|
\
|
||||||
/* transpose coefficients (phase 2) */ \
|
/* transpose coefficients (phase 2) */ \
|
||||||
col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61} */ \
|
col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
|
||||||
col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
|
col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
|
||||||
col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
|
col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
|
||||||
col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
|
col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
|
||||||
@@ -58,7 +66,7 @@
|
|||||||
\
|
\
|
||||||
/* transpose coefficients (phase 3) */ \
|
/* transpose coefficients (phase 3) */ \
|
||||||
col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
|
col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
|
||||||
col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71} */ \
|
col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
|
||||||
col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
|
col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
|
||||||
col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
|
col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
|
||||||
col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
|
col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
|
||||||
@@ -66,125 +74,3 @@
|
|||||||
col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
|
col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
|
||||||
col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
|
col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
|
||||||
}
|
}
|
||||||
|
|
||||||
static const __vector short constants __attribute__((aligned(16))) =
|
|
||||||
{
|
|
||||||
98 << 5, /* FIX(0.382683433) */
|
|
||||||
139 << 5, /* FIX(0.541196100) */
|
|
||||||
181 << 5, /* FIX(0.707106781) */
|
|
||||||
334 << 5 /* FIX(1.306562965) */
|
|
||||||
};
|
|
||||||
|
|
||||||
#define DO_DCT() \
|
|
||||||
{ \
|
|
||||||
/* Even part */ \
|
|
||||||
\
|
|
||||||
tmp10 = vec_add(tmp0, tmp3); \
|
|
||||||
tmp13 = vec_sub(tmp0, tmp3); \
|
|
||||||
tmp11 = vec_add(tmp1, tmp2); \
|
|
||||||
tmp12 = vec_sub(tmp1, tmp2); \
|
|
||||||
\
|
|
||||||
out0 = vec_add(tmp10, tmp11); \
|
|
||||||
out4 = vec_sub(tmp10, tmp11); \
|
|
||||||
\
|
|
||||||
z1 = vec_add(tmp12, tmp13); \
|
|
||||||
z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS); \
|
|
||||||
z1 = vec_madds(z1, PW_0707, zero); \
|
|
||||||
\
|
|
||||||
out2 = vec_add(tmp13, z1); \
|
|
||||||
out6 = vec_sub(tmp13, z1); \
|
|
||||||
\
|
|
||||||
/* Odd part */ \
|
|
||||||
\
|
|
||||||
tmp10 = vec_add(tmp4, tmp5); \
|
|
||||||
tmp11 = vec_add(tmp5, tmp6); \
|
|
||||||
tmp12 = vec_add(tmp6, tmp7); \
|
|
||||||
\
|
|
||||||
tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS); \
|
|
||||||
tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS); \
|
|
||||||
z5 = vec_sub(tmp10, tmp12); \
|
|
||||||
z5 = vec_madds(z5, PW_0382, zero); \
|
|
||||||
\
|
|
||||||
z2 = vec_madds(tmp10, PW_0541, zero); \
|
|
||||||
z2 = vec_add(z2, z5); \
|
|
||||||
\
|
|
||||||
z4 = vec_madds(tmp12, PW_1306, zero); \
|
|
||||||
z4 = vec_add(z4, z5); \
|
|
||||||
\
|
|
||||||
tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS); \
|
|
||||||
z3 = vec_madds(tmp11, PW_0707, zero); \
|
|
||||||
\
|
|
||||||
z11 = vec_add(tmp7, z3); \
|
|
||||||
z13 = vec_sub(tmp7, z3); \
|
|
||||||
\
|
|
||||||
out5 = vec_add(z13, z2); \
|
|
||||||
out3 = vec_sub(z13, z2); \
|
|
||||||
out1 = vec_add(z11, z4); \
|
|
||||||
out7 = vec_sub(z11, z4); \
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
jsimd_fdct_ifast_altivec (DCTELEM *data)
|
|
||||||
{
|
|
||||||
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
|
|
||||||
col0, col1, col2, col3, col4, col5, col6, col7,
|
|
||||||
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
|
|
||||||
z1, z2, z3, z4, z5, z11, z13,
|
|
||||||
out0, out1, out2, out3, out4, out5, out6, out7;
|
|
||||||
|
|
||||||
/* Constants */
|
|
||||||
__vector short zero = vec_splat_s16(0),
|
|
||||||
PW_0382 = vec_splat(constants, 0),
|
|
||||||
PW_0541 = vec_splat(constants, 1),
|
|
||||||
PW_0707 = vec_splat(constants, 2),
|
|
||||||
PW_1306 = vec_splat(constants, 3);
|
|
||||||
__vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
|
|
||||||
|
|
||||||
/* Pass 1: process rows. */
|
|
||||||
|
|
||||||
row0 = *(__vector short *)&data[0];
|
|
||||||
row1 = *(__vector short *)&data[8];
|
|
||||||
row2 = *(__vector short *)&data[16];
|
|
||||||
row3 = *(__vector short *)&data[24];
|
|
||||||
row4 = *(__vector short *)&data[32];
|
|
||||||
row5 = *(__vector short *)&data[40];
|
|
||||||
row6 = *(__vector short *)&data[48];
|
|
||||||
row7 = *(__vector short *)&data[56];
|
|
||||||
|
|
||||||
TRANSPOSE(row, col);
|
|
||||||
|
|
||||||
tmp0 = vec_add(col0, col7);
|
|
||||||
tmp7 = vec_sub(col0, col7);
|
|
||||||
tmp1 = vec_add(col1, col6);
|
|
||||||
tmp6 = vec_sub(col1, col6);
|
|
||||||
tmp2 = vec_add(col2, col5);
|
|
||||||
tmp5 = vec_sub(col2, col5);
|
|
||||||
tmp3 = vec_add(col3, col4);
|
|
||||||
tmp4 = vec_sub(col3, col4);
|
|
||||||
|
|
||||||
DO_DCT();
|
|
||||||
|
|
||||||
/* Pass 2: process columns. */
|
|
||||||
|
|
||||||
TRANSPOSE(out, row);
|
|
||||||
|
|
||||||
tmp0 = vec_add(row0, row7);
|
|
||||||
tmp7 = vec_sub(row0, row7);
|
|
||||||
tmp1 = vec_add(row1, row6);
|
|
||||||
tmp6 = vec_sub(row1, row6);
|
|
||||||
tmp2 = vec_add(row2, row5);
|
|
||||||
tmp5 = vec_sub(row2, row5);
|
|
||||||
tmp3 = vec_add(row3, row4);
|
|
||||||
tmp4 = vec_sub(row3, row4);
|
|
||||||
|
|
||||||
DO_DCT();
|
|
||||||
|
|
||||||
*(__vector short *)&data[0] = out0;
|
|
||||||
*(__vector short *)&data[8] = out1;
|
|
||||||
*(__vector short *)&data[16] = out2;
|
|
||||||
*(__vector short *)&data[24] = out3;
|
|
||||||
*(__vector short *)&data[32] = out4;
|
|
||||||
*(__vector short *)&data[40] = out5;
|
|
||||||
*(__vector short *)&data[48] = out6;
|
|
||||||
*(__vector short *)&data[56] = out7;
|
|
||||||
}
|
|
||||||
@@ -6,6 +6,7 @@
|
|||||||
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
||||||
* Copyright (C) 2013-2014, Linaro Limited
|
* Copyright (C) 2013-2014, Linaro Limited
|
||||||
* Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
|
* Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
|
||||||
|
* Copyright (C) 2014, D. R. Commander. All rights reserved.
|
||||||
*
|
*
|
||||||
* This software is provided 'as-is', without any express or implied
|
* This software is provided 'as-is', without any express or implied
|
||||||
* warranty. In no event will the authors be held liable for any damages
|
* warranty. In no event will the authors be held liable for any damages
|
||||||
@@ -197,21 +198,21 @@ _\fname:
|
|||||||
tmp13 = q1; \
|
tmp13 = q1; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define XFIX_0_899976223 v0.4h[0]
|
#define XFIX_0_899976223 v0.h[0]
|
||||||
#define XFIX_0_541196100 v0.4h[1]
|
#define XFIX_0_541196100 v0.h[1]
|
||||||
#define XFIX_2_562915447 v0.4h[2]
|
#define XFIX_2_562915447 v0.h[2]
|
||||||
#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
|
#define XFIX_0_298631336_MINUS_0_899976223 v0.h[3]
|
||||||
#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
|
#define XFIX_1_501321110_MINUS_0_899976223 v1.h[0]
|
||||||
#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
|
#define XFIX_2_053119869_MINUS_2_562915447 v1.h[1]
|
||||||
#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
|
#define XFIX_0_541196100_PLUS_0_765366865 v1.h[2]
|
||||||
#define XFIX_1_175875602 v1.4h[3]
|
#define XFIX_1_175875602 v1.h[3]
|
||||||
#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
|
#define XFIX_1_175875602_MINUS_0_390180644 v2.h[0]
|
||||||
#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
|
#define XFIX_0_541196100_MINUS_1_847759065 v2.h[1]
|
||||||
#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
|
#define XFIX_3_072711026_MINUS_2_562915447 v2.h[2]
|
||||||
#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
|
#define XFIX_1_175875602_MINUS_1_961570560 v2.h[3]
|
||||||
|
|
||||||
.balign 16
|
.balign 16
|
||||||
jsimd_idct_islow_neon_consts:
|
Ljsimd_idct_islow_neon_consts:
|
||||||
.short FIX_0_899976223 /* d0[0] */
|
.short FIX_0_899976223 /* d0[0] */
|
||||||
.short FIX_0_541196100 /* d0[1] */
|
.short FIX_0_541196100 /* d0[1] */
|
||||||
.short FIX_2_562915447 /* d0[2] */
|
.short FIX_2_562915447 /* d0[2] */
|
||||||
@@ -256,54 +257,54 @@ asm_function jsimd_idct_islow_neon
|
|||||||
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
|
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
|
||||||
sub sp, sp, 272
|
sub sp, sp, 272
|
||||||
str x15, [sp], 16
|
str x15, [sp], 16
|
||||||
adr x15, jsimd_idct_islow_neon_consts
|
adr x15, Ljsimd_idct_islow_neon_consts
|
||||||
st1 {v0.8b - v3.8b}, [sp], 32
|
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
st1 {v4.8b - v7.8b}, [sp], 32
|
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
st1 {v8.8b - v11.8b}, [sp], 32
|
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
st1 {v12.8b - v15.8b}, [sp], 32
|
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
st1 {v16.8b - v19.8b}, [sp], 32
|
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
st1 {v20.8b - v23.8b}, [sp], 32
|
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||||
st1 {v24.8b - v27.8b}, [sp], 32
|
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
st1 {v28.8b - v31.8b}, [sp], 32
|
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||||
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
|
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
|
||||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||||
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
|
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
|
||||||
mul v16.4h, v16.4h, v0.4h
|
mul v16.4h, v16.4h, v0.4h
|
||||||
mul v17.4h, v17.4h, v1.4h
|
mul v17.4h, v17.4h, v1.4h
|
||||||
ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
|
ins v16.d[1], v17.d[0] /* 128 bit q8 */
|
||||||
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
||||||
mul v18.4h, v18.4h, v2.4h
|
mul v18.4h, v18.4h, v2.4h
|
||||||
mul v19.4h, v19.4h, v3.4h
|
mul v19.4h, v19.4h, v3.4h
|
||||||
ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
|
ins v18.d[1], v19.d[0] /* 128 bit q9 */
|
||||||
ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
|
ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
|
||||||
mul v20.4h, v20.4h, v4.4h
|
mul v20.4h, v20.4h, v4.4h
|
||||||
mul v21.4h, v21.4h, v5.4h
|
mul v21.4h, v21.4h, v5.4h
|
||||||
ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
|
ins v20.d[1], v21.d[0] /* 128 bit q10 */
|
||||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||||
mul v22.4h, v22.4h, v6.4h
|
mul v22.4h, v22.4h, v6.4h
|
||||||
mul v23.4h, v23.4h, v7.4h
|
mul v23.4h, v23.4h, v7.4h
|
||||||
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
|
ins v22.d[1], v23.d[0] /* 128 bit q11 */
|
||||||
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
|
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
|
||||||
mul v24.4h, v24.4h, v0.4h
|
mul v24.4h, v24.4h, v0.4h
|
||||||
mul v25.4h, v25.4h, v1.4h
|
mul v25.4h, v25.4h, v1.4h
|
||||||
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
|
ins v24.d[1], v25.d[0] /* 128 bit q12 */
|
||||||
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
||||||
mul v28.4h, v28.4h, v4.4h
|
mul v28.4h, v28.4h, v4.4h
|
||||||
mul v29.4h, v29.4h, v5.4h
|
mul v29.4h, v29.4h, v5.4h
|
||||||
ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
|
ins v28.d[1], v29.d[0] /* 128 bit q14 */
|
||||||
mul v26.4h, v26.4h, v2.4h
|
mul v26.4h, v26.4h, v2.4h
|
||||||
mul v27.4h, v27.4h, v3.4h
|
mul v27.4h, v27.4h, v3.4h
|
||||||
ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
|
ins v26.d[1], v27.d[0] /* 128 bit q13 */
|
||||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
|
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
|
||||||
add x15, x15, #16
|
add x15, x15, #16
|
||||||
mul v30.4h, v30.4h, v6.4h
|
mul v30.4h, v30.4h, v6.4h
|
||||||
mul v31.4h, v31.4h, v7.4h
|
mul v31.4h, v31.4h, v7.4h
|
||||||
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
|
ins v30.d[1], v31.d[0] /* 128 bit q15 */
|
||||||
/* Go to the bottom of the stack */
|
/* Go to the bottom of the stack */
|
||||||
sub sp, sp, 352
|
sub sp, sp, 352
|
||||||
stp x4, x5, [sp], 16
|
stp x4, x5, [sp], 16
|
||||||
st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
|
st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32 /* save NEON registers */
|
||||||
st1 {v12.4h - v15.4h}, [sp], 32
|
st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
|
||||||
/* 1-D IDCT, pass 1, left 4x8 half */
|
/* 1-D IDCT, pass 1, left 4x8 half */
|
||||||
add v4.4h, ROW7L.4h, ROW3L.4h
|
add v4.4h, ROW7L.4h, ROW3L.4h
|
||||||
add v5.4h, ROW5L.4h, ROW1L.4h
|
add v5.4h, ROW5L.4h, ROW1L.4h
|
||||||
@@ -378,7 +379,7 @@ asm_function jsimd_idct_islow_neon
|
|||||||
rshrn ROW0L.4h, v12.4s, #11
|
rshrn ROW0L.4h, v12.4s, #11
|
||||||
rshrn ROW4L.4h, v6.4s, #11
|
rshrn ROW4L.4h, v6.4s, #11
|
||||||
|
|
||||||
beq 3f /* Go to do some special handling for the sparse right 4x8 half */
|
b.eq 3f /* Go to do some special handling for the sparse right 4x8 half */
|
||||||
|
|
||||||
/* 1-D IDCT, pass 1, right 4x8 half */
|
/* 1-D IDCT, pass 1, right 4x8 half */
|
||||||
ld1 {v2.4h}, [x15] /* reload constants */
|
ld1 {v2.4h}, [x15] /* reload constants */
|
||||||
@@ -553,33 +554,33 @@ asm_function jsimd_idct_islow_neon
|
|||||||
shrn ROW4R.4h, v6.4s, #16
|
shrn ROW4R.4h, v6.4s, #16
|
||||||
|
|
||||||
2: /* Descale to 8-bit and range limit */
|
2: /* Descale to 8-bit and range limit */
|
||||||
ins v16.2d[1], v17.2d[0]
|
ins v16.d[1], v17.d[0]
|
||||||
ins v18.2d[1], v19.2d[0]
|
ins v18.d[1], v19.d[0]
|
||||||
ins v20.2d[1], v21.2d[0]
|
ins v20.d[1], v21.d[0]
|
||||||
ins v22.2d[1], v23.2d[0]
|
ins v22.d[1], v23.d[0]
|
||||||
sqrshrn v16.8b, v16.8h, #2
|
sqrshrn v16.8b, v16.8h, #2
|
||||||
sqrshrn2 v16.16b, v18.8h, #2
|
sqrshrn2 v16.16b, v18.8h, #2
|
||||||
sqrshrn v18.8b, v20.8h, #2
|
sqrshrn v18.8b, v20.8h, #2
|
||||||
sqrshrn2 v18.16b, v22.8h, #2
|
sqrshrn2 v18.16b, v22.8h, #2
|
||||||
|
|
||||||
/* vpop {v8.4h - v15.4h} */ /* restore NEON registers */
|
/* vpop {v8.4h - v15.4h} */ /* restore NEON registers */
|
||||||
ld1 {v8.4h - v11.4h}, [sp], 32
|
ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
|
||||||
ld1 {v12.4h - v15.4h}, [sp], 32
|
ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
|
||||||
ins v24.2d[1], v25.2d[0]
|
ins v24.d[1], v25.d[0]
|
||||||
|
|
||||||
sqrshrn v20.8b, v24.8h, #2
|
sqrshrn v20.8b, v24.8h, #2
|
||||||
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
|
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
|
||||||
/* trn1 v16.8h, v16.8h, v18.8h */
|
/* trn1 v16.8h, v16.8h, v18.8h */
|
||||||
transpose v16, v18, v3, .16b, .8h
|
transpose v16, v18, v3, .16b, .8h
|
||||||
ins v26.2d[1], v27.2d[0]
|
ins v26.d[1], v27.d[0]
|
||||||
ins v28.2d[1], v29.2d[0]
|
ins v28.d[1], v29.d[0]
|
||||||
ins v30.2d[1], v31.2d[0]
|
ins v30.d[1], v31.d[0]
|
||||||
sqrshrn2 v20.16b, v26.8h, #2
|
sqrshrn2 v20.16b, v26.8h, #2
|
||||||
sqrshrn v22.8b, v28.8h, #2
|
sqrshrn v22.8b, v28.8h, #2
|
||||||
movi v0.16b, #(CENTERJSAMPLE)
|
movi v0.16b, #(CENTERJSAMPLE)
|
||||||
sqrshrn2 v22.16b, v30.8h, #2
|
sqrshrn2 v22.16b, v30.8h, #2
|
||||||
transpose_single v16, v17, v3, .2d, .8b
|
transpose_single v16, v17, v3, .d, .8b
|
||||||
transpose_single v18, v19, v3, .2d, .8b
|
transpose_single v18, v19, v3, .d, .8b
|
||||||
add v16.8b, v16.8b, v0.8b
|
add v16.8b, v16.8b, v0.8b
|
||||||
add v17.8b, v17.8b, v0.8b
|
add v17.8b, v17.8b, v0.8b
|
||||||
add v18.8b, v18.8b, v0.8b
|
add v18.8b, v18.8b, v0.8b
|
||||||
@@ -590,7 +591,7 @@ asm_function jsimd_idct_islow_neon
|
|||||||
add TMP1, TMP1, OUTPUT_COL
|
add TMP1, TMP1, OUTPUT_COL
|
||||||
add TMP2, TMP2, OUTPUT_COL
|
add TMP2, TMP2, OUTPUT_COL
|
||||||
st1 {v16.8b}, [TMP1]
|
st1 {v16.8b}, [TMP1]
|
||||||
transpose_single v20, v21, v3, .2d, .8b
|
transpose_single v20, v21, v3, .d, .8b
|
||||||
st1 {v17.8b}, [TMP2]
|
st1 {v17.8b}, [TMP2]
|
||||||
ldp TMP1, TMP2, [OUTPUT_BUF], 16
|
ldp TMP1, TMP2, [OUTPUT_BUF], 16
|
||||||
add TMP1, TMP1, OUTPUT_COL
|
add TMP1, TMP1, OUTPUT_COL
|
||||||
@@ -605,7 +606,7 @@ asm_function jsimd_idct_islow_neon
|
|||||||
add TMP2, TMP2, OUTPUT_COL
|
add TMP2, TMP2, OUTPUT_COL
|
||||||
add TMP3, TMP3, OUTPUT_COL
|
add TMP3, TMP3, OUTPUT_COL
|
||||||
add TMP4, TMP4, OUTPUT_COL
|
add TMP4, TMP4, OUTPUT_COL
|
||||||
transpose_single v22, v23, v3, .2d, .8b
|
transpose_single v22, v23, v3, .d, .8b
|
||||||
st1 {v20.8b}, [TMP1]
|
st1 {v20.8b}, [TMP1]
|
||||||
add v22.8b, v22.8b, v0.8b
|
add v22.8b, v22.8b, v0.8b
|
||||||
add v23.8b, v23.8b, v0.8b
|
add v23.8b, v23.8b, v0.8b
|
||||||
@@ -613,14 +614,14 @@ asm_function jsimd_idct_islow_neon
|
|||||||
st1 {v22.8b}, [TMP3]
|
st1 {v22.8b}, [TMP3]
|
||||||
st1 {v23.8b}, [TMP4]
|
st1 {v23.8b}, [TMP4]
|
||||||
ldr x15, [sp], 16
|
ldr x15, [sp], 16
|
||||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
|
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
|
||||||
@@ -636,17 +637,17 @@ asm_function jsimd_idct_islow_neon
|
|||||||
transpose ROW0L, ROW2L, v3, .16b, .2s
|
transpose ROW0L, ROW2L, v3, .16b, .2s
|
||||||
transpose ROW5L, ROW7L, v3, .16b, .2s
|
transpose ROW5L, ROW7L, v3, .16b, .2s
|
||||||
cmp x0, #0
|
cmp x0, #0
|
||||||
beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
|
b.eq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
|
||||||
|
|
||||||
/* Only row 0 is non-zero for the right 4x8 half */
|
/* Only row 0 is non-zero for the right 4x8 half */
|
||||||
dup ROW1R.4h, ROW0R.4h[1]
|
dup ROW1R.4h, ROW0R.h[1]
|
||||||
dup ROW2R.4h, ROW0R.4h[2]
|
dup ROW2R.4h, ROW0R.h[2]
|
||||||
dup ROW3R.4h, ROW0R.4h[3]
|
dup ROW3R.4h, ROW0R.h[3]
|
||||||
dup ROW4R.4h, ROW0R.4h[0]
|
dup ROW4R.4h, ROW0R.h[0]
|
||||||
dup ROW5R.4h, ROW0R.4h[1]
|
dup ROW5R.4h, ROW0R.h[1]
|
||||||
dup ROW6R.4h, ROW0R.4h[2]
|
dup ROW6R.4h, ROW0R.h[2]
|
||||||
dup ROW7R.4h, ROW0R.4h[3]
|
dup ROW7R.4h, ROW0R.h[3]
|
||||||
dup ROW0R.4h, ROW0R.4h[0]
|
dup ROW0R.4h, ROW0R.h[0]
|
||||||
b 1b /* Go to 'normal' second pass */
|
b 1b /* Go to 'normal' second pass */
|
||||||
|
|
||||||
4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
|
4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
|
||||||
@@ -770,13 +771,13 @@ asm_function jsimd_idct_islow_neon
|
|||||||
* per 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
|
* per 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define XFIX_1_082392200 v0.4h[0]
|
#define XFIX_1_082392200 v0.h[0]
|
||||||
#define XFIX_1_414213562 v0.4h[1]
|
#define XFIX_1_414213562 v0.h[1]
|
||||||
#define XFIX_1_847759065 v0.4h[2]
|
#define XFIX_1_847759065 v0.h[2]
|
||||||
#define XFIX_2_613125930 v0.4h[3]
|
#define XFIX_2_613125930 v0.h[3]
|
||||||
|
|
||||||
.balign 16
|
.balign 16
|
||||||
jsimd_idct_ifast_neon_consts:
|
Ljsimd_idct_ifast_neon_consts:
|
||||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||||
@@ -810,12 +811,12 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
/* Save NEON registers used in fast IDCT */
|
/* Save NEON registers used in fast IDCT */
|
||||||
sub sp, sp, #176
|
sub sp, sp, #176
|
||||||
stp x22, x23, [sp], 16
|
stp x22, x23, [sp], 16
|
||||||
adr x23, jsimd_idct_ifast_neon_consts
|
adr x23, Ljsimd_idct_ifast_neon_consts
|
||||||
st1 {v0.8b - v3.8b}, [sp], 32
|
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
st1 {v4.8b - v7.8b}, [sp], 32
|
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
st1 {v8.8b - v11.8b}, [sp], 32
|
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
st1 {v12.8b - v15.8b}, [sp], 32
|
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
st1 {v16.8b - v19.8b}, [sp], 32
|
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
|
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
|
||||||
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
||||||
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
|
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
|
||||||
@@ -909,24 +910,24 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
trn2 v15.4s, v18.4s, v15.4s
|
trn2 v15.4s, v18.4s, v15.4s
|
||||||
/* vswp v14.4h, v10-MSB.4h */
|
/* vswp v14.4h, v10-MSB.4h */
|
||||||
umov x22, v14.d[0]
|
umov x22, v14.d[0]
|
||||||
ins v14.2d[0], v10.2d[1]
|
ins v14.d[0], v10.d[1]
|
||||||
ins v10.2d[1], x22
|
ins v10.d[1], x22
|
||||||
/* vswp v13.4h, v9-MSB.4h */
|
/* vswp v13.4h, v9-MSB.4h */
|
||||||
|
|
||||||
umov x22, v13.d[0]
|
umov x22, v13.d[0]
|
||||||
ins v13.2d[0], v9.2d[1]
|
ins v13.d[0], v9.d[1]
|
||||||
ins v9.2d[1], x22
|
ins v9.d[1], x22
|
||||||
/* 1-D IDCT, pass 2 */
|
/* 1-D IDCT, pass 2 */
|
||||||
sub v2.8h, v10.8h, v14.8h
|
sub v2.8h, v10.8h, v14.8h
|
||||||
/* vswp v15.4h, v11-MSB.4h */
|
/* vswp v15.4h, v11-MSB.4h */
|
||||||
umov x22, v15.d[0]
|
umov x22, v15.d[0]
|
||||||
ins v15.2d[0], v11.2d[1]
|
ins v15.d[0], v11.d[1]
|
||||||
ins v11.2d[1], x22
|
ins v11.d[1], x22
|
||||||
add v14.8h, v10.8h, v14.8h
|
add v14.8h, v10.8h, v14.8h
|
||||||
/* vswp v12.4h, v8-MSB.4h */
|
/* vswp v12.4h, v8-MSB.4h */
|
||||||
umov x22, v12.d[0]
|
umov x22, v12.d[0]
|
||||||
ins v12.2d[0], v8.2d[1]
|
ins v12.d[0], v8.d[1]
|
||||||
ins v8.2d[1], x22
|
ins v8.d[1], x22
|
||||||
sub v1.8h, v11.8h, v13.8h
|
sub v1.8h, v11.8h, v13.8h
|
||||||
add v13.8h, v11.8h, v13.8h
|
add v13.8h, v11.8h, v13.8h
|
||||||
sub v5.8h, v9.8h, v15.8h
|
sub v5.8h, v9.8h, v15.8h
|
||||||
@@ -997,13 +998,13 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
trn1 v9.4s, v9.4s, v11.4s
|
trn1 v9.4s, v9.4s, v11.4s
|
||||||
trn2 v11.4s, v18.4s, v11.4s
|
trn2 v11.4s, v18.4s, v11.4s
|
||||||
/* make copy */
|
/* make copy */
|
||||||
ins v17.2d[0], v8.2d[1]
|
ins v17.d[0], v8.d[1]
|
||||||
/* Transpose d16-d17-msb */
|
/* Transpose d16-d17-msb */
|
||||||
mov v18.16b, v8.16b
|
mov v18.16b, v8.16b
|
||||||
trn1 v8.8b, v8.8b, v17.8b
|
trn1 v8.8b, v8.8b, v17.8b
|
||||||
trn2 v17.8b, v18.8b, v17.8b
|
trn2 v17.8b, v18.8b, v17.8b
|
||||||
/* make copy */
|
/* make copy */
|
||||||
ins v19.2d[0], v9.2d[1]
|
ins v19.d[0], v9.d[1]
|
||||||
mov v18.16b, v9.16b
|
mov v18.16b, v9.16b
|
||||||
trn1 v9.8b, v9.8b, v19.8b
|
trn1 v9.8b, v9.8b, v19.8b
|
||||||
trn2 v19.8b, v18.8b, v19.8b
|
trn2 v19.8b, v18.8b, v19.8b
|
||||||
@@ -1018,7 +1019,7 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
add TMP2, TMP2, OUTPUT_COL
|
add TMP2, TMP2, OUTPUT_COL
|
||||||
st1 {v9.8b}, [TMP1]
|
st1 {v9.8b}, [TMP1]
|
||||||
/* make copy */
|
/* make copy */
|
||||||
ins v7.2d[0], v10.2d[1]
|
ins v7.d[0], v10.d[1]
|
||||||
mov v18.16b, v10.16b
|
mov v18.16b, v10.16b
|
||||||
trn1 v10.8b, v10.8b, v7.8b
|
trn1 v10.8b, v10.8b, v7.8b
|
||||||
trn2 v7.8b, v18.8b, v7.8b
|
trn2 v7.8b, v18.8b, v7.8b
|
||||||
@@ -1031,7 +1032,7 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
add TMP5, TMP5, OUTPUT_COL
|
add TMP5, TMP5, OUTPUT_COL
|
||||||
st1 {v10.8b}, [TMP1]
|
st1 {v10.8b}, [TMP1]
|
||||||
/* make copy */
|
/* make copy */
|
||||||
ins v16.2d[0], v11.2d[1]
|
ins v16.d[0], v11.d[1]
|
||||||
mov v18.16b, v11.16b
|
mov v18.16b, v11.16b
|
||||||
trn1 v11.8b, v11.8b, v16.8b
|
trn1 v11.8b, v11.8b, v16.8b
|
||||||
trn2 v16.8b, v18.8b, v16.8b
|
trn2 v16.8b, v18.8b, v16.8b
|
||||||
@@ -1040,11 +1041,11 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
st1 {v16.8b}, [TMP5]
|
st1 {v16.8b}, [TMP5]
|
||||||
sub sp, sp, #176
|
sub sp, sp, #176
|
||||||
ldp x22, x23, [sp], 16
|
ldp x22, x23, [sp], 16
|
||||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
.unreq DCT_TABLE
|
.unreq DCT_TABLE
|
||||||
@@ -1095,38 +1096,38 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||||
|
|
||||||
.balign 16
|
.balign 16
|
||||||
jsimd_idct_4x4_neon_consts:
|
Ljsimd_idct_4x4_neon_consts:
|
||||||
.short FIX_1_847759065 /* v0.4h[0] */
|
.short FIX_1_847759065 /* v0.h[0] */
|
||||||
.short -FIX_0_765366865 /* v0.4h[1] */
|
.short -FIX_0_765366865 /* v0.h[1] */
|
||||||
.short -FIX_0_211164243 /* v0.4h[2] */
|
.short -FIX_0_211164243 /* v0.h[2] */
|
||||||
.short FIX_1_451774981 /* v0.4h[3] */
|
.short FIX_1_451774981 /* v0.h[3] */
|
||||||
.short -FIX_2_172734803 /* d1[0] */
|
.short -FIX_2_172734803 /* d1[0] */
|
||||||
.short FIX_1_061594337 /* d1[1] */
|
.short FIX_1_061594337 /* d1[1] */
|
||||||
.short -FIX_0_509795579 /* d1[2] */
|
.short -FIX_0_509795579 /* d1[2] */
|
||||||
.short -FIX_0_601344887 /* d1[3] */
|
.short -FIX_0_601344887 /* d1[3] */
|
||||||
.short FIX_0_899976223 /* v2.4h[0] */
|
.short FIX_0_899976223 /* v2.h[0] */
|
||||||
.short FIX_2_562915447 /* v2.4h[1] */
|
.short FIX_2_562915447 /* v2.h[1] */
|
||||||
.short 1 << (CONST_BITS+1) /* v2.4h[2] */
|
.short 1 << (CONST_BITS+1) /* v2.h[2] */
|
||||||
.short 0 /* v2.4h[3] */
|
.short 0 /* v2.h[3] */
|
||||||
|
|
||||||
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
|
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
|
||||||
smull v28.4s, \x4, v2.4h[2]
|
smull v28.4s, \x4, v2.h[2]
|
||||||
smlal v28.4s, \x8, v0.4h[0]
|
smlal v28.4s, \x8, v0.h[0]
|
||||||
smlal v28.4s, \x14, v0.4h[1]
|
smlal v28.4s, \x14, v0.h[1]
|
||||||
|
|
||||||
smull v26.4s, \x16, v1.4h[2]
|
smull v26.4s, \x16, v1.h[2]
|
||||||
smlal v26.4s, \x12, v1.4h[3]
|
smlal v26.4s, \x12, v1.h[3]
|
||||||
smlal v26.4s, \x10, v2.4h[0]
|
smlal v26.4s, \x10, v2.h[0]
|
||||||
smlal v26.4s, \x6, v2.4h[1]
|
smlal v26.4s, \x6, v2.h[1]
|
||||||
|
|
||||||
smull v30.4s, \x4, v2.4h[2]
|
smull v30.4s, \x4, v2.h[2]
|
||||||
smlsl v30.4s, \x8, v0.4h[0]
|
smlsl v30.4s, \x8, v0.h[0]
|
||||||
smlsl v30.4s, \x14, v0.4h[1]
|
smlsl v30.4s, \x14, v0.h[1]
|
||||||
|
|
||||||
smull v24.4s, \x16, v0.4h[2]
|
smull v24.4s, \x16, v0.h[2]
|
||||||
smlal v24.4s, \x12, v0.4h[3]
|
smlal v24.4s, \x12, v0.h[3]
|
||||||
smlal v24.4s, \x10, v1.4h[0]
|
smlal v24.4s, \x10, v1.h[0]
|
||||||
smlal v24.4s, \x6, v1.4h[1]
|
smlal v24.4s, \x6, v1.h[1]
|
||||||
|
|
||||||
add v20.4s, v28.4s, v26.4s
|
add v20.4s, v28.4s, v26.4s
|
||||||
sub v28.4s, v28.4s, v26.4s
|
sub v28.4s, v28.4s, v26.4s
|
||||||
@@ -1171,15 +1172,15 @@ asm_function jsimd_idct_4x4_neon
|
|||||||
sub sp, sp, 272
|
sub sp, sp, 272
|
||||||
str x15, [sp], 16
|
str x15, [sp], 16
|
||||||
/* Load constants (v3.4h is just used for padding) */
|
/* Load constants (v3.4h is just used for padding) */
|
||||||
adr TMP4, jsimd_idct_4x4_neon_consts
|
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||||
st1 {v0.8b - v3.8b}, [sp], 32
|
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
st1 {v4.8b - v7.8b}, [sp], 32
|
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
st1 {v8.8b - v11.8b}, [sp], 32
|
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
st1 {v12.8b - v15.8b}, [sp], 32
|
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
st1 {v16.8b - v19.8b}, [sp], 32
|
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
st1 {v20.8b - v23.8b}, [sp], 32
|
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||||
st1 {v24.8b - v27.8b}, [sp], 32
|
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
st1 {v28.8b - v31.8b}, [sp], 32
|
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||||
|
|
||||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||||
@@ -1203,45 +1204,45 @@ asm_function jsimd_idct_4x4_neon
|
|||||||
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
||||||
mul v4.4h, v4.4h, v18.4h
|
mul v4.4h, v4.4h, v18.4h
|
||||||
mul v5.4h, v5.4h, v19.4h
|
mul v5.4h, v5.4h, v19.4h
|
||||||
ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
|
ins v4.d[1], v5.d[0] /* 128 bit q4 */
|
||||||
ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
|
ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
|
||||||
mul v6.4h, v6.4h, v20.4h
|
mul v6.4h, v6.4h, v20.4h
|
||||||
mul v7.4h, v7.4h, v21.4h
|
mul v7.4h, v7.4h, v21.4h
|
||||||
ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
|
ins v6.d[1], v7.d[0] /* 128 bit q6 */
|
||||||
mul v8.4h, v8.4h, v22.4h
|
mul v8.4h, v8.4h, v22.4h
|
||||||
mul v9.4h, v9.4h, v23.4h
|
mul v9.4h, v9.4h, v23.4h
|
||||||
ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
|
ins v8.d[1], v9.d[0] /* 128 bit q8 */
|
||||||
add DCT_TABLE, DCT_TABLE, #16
|
add DCT_TABLE, DCT_TABLE, #16
|
||||||
ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
|
ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
|
||||||
mul v10.4h, v10.4h, v24.4h
|
mul v10.4h, v10.4h, v24.4h
|
||||||
mul v11.4h, v11.4h, v25.4h
|
mul v11.4h, v11.4h, v25.4h
|
||||||
ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
|
ins v10.d[1], v11.d[0] /* 128 bit q10 */
|
||||||
mul v12.4h, v12.4h, v26.4h
|
mul v12.4h, v12.4h, v26.4h
|
||||||
mul v13.4h, v13.4h, v27.4h
|
mul v13.4h, v13.4h, v27.4h
|
||||||
ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
|
ins v12.d[1], v13.d[0] /* 128 bit q12 */
|
||||||
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
||||||
mul v14.4h, v14.4h, v28.4h
|
mul v14.4h, v14.4h, v28.4h
|
||||||
mul v15.4h, v15.4h, v29.4h
|
mul v15.4h, v15.4h, v29.4h
|
||||||
ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
|
ins v14.d[1], v15.d[0] /* 128 bit q14 */
|
||||||
mul v16.4h, v16.4h, v30.4h
|
mul v16.4h, v16.4h, v30.4h
|
||||||
mul v17.4h, v17.4h, v31.4h
|
mul v17.4h, v17.4h, v31.4h
|
||||||
ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
|
ins v16.d[1], v17.d[0] /* 128 bit q16 */
|
||||||
|
|
||||||
/* Pass 1 */
|
/* Pass 1 */
|
||||||
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
|
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
|
||||||
transpose_4x4 v4, v6, v8, v10, v3
|
transpose_4x4 v4, v6, v8, v10, v3
|
||||||
ins v10.2d[1], v11.2d[0]
|
ins v10.d[1], v11.d[0]
|
||||||
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
|
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
|
||||||
transpose_4x4 v5, v7, v9, v11, v3
|
transpose_4x4 v5, v7, v9, v11, v3
|
||||||
ins v10.2d[1], v11.2d[0]
|
ins v10.d[1], v11.d[0]
|
||||||
/* Pass 2 */
|
/* Pass 2 */
|
||||||
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
|
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
|
||||||
transpose_4x4 v26, v27, v28, v29, v3
|
transpose_4x4 v26, v27, v28, v29, v3
|
||||||
|
|
||||||
/* Range limit */
|
/* Range limit */
|
||||||
movi v30.8h, #0x80
|
movi v30.8h, #0x80
|
||||||
ins v26.2d[1], v27.2d[0]
|
ins v26.d[1], v27.d[0]
|
||||||
ins v28.2d[1], v29.2d[0]
|
ins v28.d[1], v29.d[0]
|
||||||
add v26.8h, v26.8h, v30.8h
|
add v26.8h, v26.8h, v30.8h
|
||||||
add v28.8h, v28.8h, v30.8h
|
add v28.8h, v28.8h, v30.8h
|
||||||
sqxtun v26.8b, v26.8h
|
sqxtun v26.8b, v26.8h
|
||||||
@@ -1286,14 +1287,14 @@ asm_function jsimd_idct_4x4_neon
|
|||||||
/* vpop {v8.4h - v15.4h} ;not available */
|
/* vpop {v8.4h - v15.4h} ;not available */
|
||||||
sub sp, sp, #272
|
sub sp, sp, #272
|
||||||
ldr x15, [sp], 16
|
ldr x15, [sp], 16
|
||||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
.unreq DCT_TABLE
|
.unreq DCT_TABLE
|
||||||
@@ -1325,7 +1326,7 @@ asm_function jsimd_idct_4x4_neon
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
.balign 8
|
.balign 8
|
||||||
jsimd_idct_2x2_neon_consts:
|
Ljsimd_idct_2x2_neon_consts:
|
||||||
.short -FIX_0_720959822 /* v14[0] */
|
.short -FIX_0_720959822 /* v14[0] */
|
||||||
.short FIX_0_850430095 /* v14[1] */
|
.short FIX_0_850430095 /* v14[1] */
|
||||||
.short -FIX_1_272758580 /* v14[2] */
|
.short -FIX_1_272758580 /* v14[2] */
|
||||||
@@ -1333,10 +1334,10 @@ jsimd_idct_2x2_neon_consts:
|
|||||||
|
|
||||||
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
||||||
sshll v15.4s, \x4, #15
|
sshll v15.4s, \x4, #15
|
||||||
smull v26.4s, \x6, v14.4h[3]
|
smull v26.4s, \x6, v14.h[3]
|
||||||
smlal v26.4s, \x10, v14.4h[2]
|
smlal v26.4s, \x10, v14.h[2]
|
||||||
smlal v26.4s, \x12, v14.4h[1]
|
smlal v26.4s, \x12, v14.h[1]
|
||||||
smlal v26.4s, \x16, v14.4h[0]
|
smlal v26.4s, \x16, v14.h[0]
|
||||||
|
|
||||||
add v20.4s, v15.4s, v26.4s
|
add v20.4s, v15.4s, v26.4s
|
||||||
sub v15.4s, v15.4s, v26.4s
|
sub v15.4s, v15.4s, v26.4s
|
||||||
@@ -1367,14 +1368,14 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
str x15, [sp], 16
|
str x15, [sp], 16
|
||||||
|
|
||||||
/* Load constants */
|
/* Load constants */
|
||||||
adr TMP2, jsimd_idct_2x2_neon_consts
|
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||||
st1 {v4.8b - v7.8b}, [sp], 32
|
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
st1 {v8.8b - v11.8b}, [sp], 32
|
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
st1 {v12.8b - v15.8b}, [sp], 32
|
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
st1 {v16.8b - v19.8b}, [sp], 32
|
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
st1 {v21.8b - v22.8b}, [sp], 16
|
st1 {v21.8b, v22.8b}, [sp], 16
|
||||||
st1 {v24.8b - v27.8b}, [sp], 32
|
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
st1 {v30.8b - v31.8b}, [sp], 16
|
st1 {v30.8b, v31.8b}, [sp], 16
|
||||||
ld1 {v14.4h}, [TMP2]
|
ld1 {v14.4h}, [TMP2]
|
||||||
|
|
||||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||||
@@ -1400,25 +1401,25 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
||||||
mul v4.4h, v4.4h, v18.4h
|
mul v4.4h, v4.4h, v18.4h
|
||||||
mul v5.4h, v5.4h, v19.4h
|
mul v5.4h, v5.4h, v19.4h
|
||||||
ins v4.2d[1], v5.2d[0]
|
ins v4.d[1], v5.d[0]
|
||||||
mul v6.4h, v6.4h, v20.4h
|
mul v6.4h, v6.4h, v20.4h
|
||||||
mul v7.4h, v7.4h, v21.4h
|
mul v7.4h, v7.4h, v21.4h
|
||||||
ins v6.2d[1], v7.2d[0]
|
ins v6.d[1], v7.d[0]
|
||||||
add DCT_TABLE, DCT_TABLE, #16
|
add DCT_TABLE, DCT_TABLE, #16
|
||||||
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
|
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
|
||||||
mul v10.4h, v10.4h, v24.4h
|
mul v10.4h, v10.4h, v24.4h
|
||||||
mul v11.4h, v11.4h, v25.4h
|
mul v11.4h, v11.4h, v25.4h
|
||||||
ins v10.2d[1], v11.2d[0]
|
ins v10.d[1], v11.d[0]
|
||||||
add DCT_TABLE, DCT_TABLE, #16
|
add DCT_TABLE, DCT_TABLE, #16
|
||||||
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
|
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
|
||||||
mul v12.4h, v12.4h, v26.4h
|
mul v12.4h, v12.4h, v26.4h
|
||||||
mul v13.4h, v13.4h, v27.4h
|
mul v13.4h, v13.4h, v27.4h
|
||||||
ins v12.2d[1], v13.2d[0]
|
ins v12.d[1], v13.d[0]
|
||||||
add DCT_TABLE, DCT_TABLE, #16
|
add DCT_TABLE, DCT_TABLE, #16
|
||||||
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
||||||
mul v16.4h, v16.4h, v30.4h
|
mul v16.4h, v16.4h, v30.4h
|
||||||
mul v17.4h, v17.4h, v31.4h
|
mul v17.4h, v17.4h, v31.4h
|
||||||
ins v16.2d[1], v17.2d[0]
|
ins v16.d[1], v17.d[0]
|
||||||
|
|
||||||
/* Pass 1 */
|
/* Pass 1 */
|
||||||
#if 0
|
#if 0
|
||||||
@@ -1427,14 +1428,14 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
|
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
|
||||||
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
|
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
|
||||||
#else
|
#else
|
||||||
smull v26.4s, v6.4h, v14.4h[3]
|
smull v26.4s, v6.4h, v14.h[3]
|
||||||
smlal v26.4s, v10.4h, v14.4h[2]
|
smlal v26.4s, v10.4h, v14.h[2]
|
||||||
smlal v26.4s, v12.4h, v14.4h[1]
|
smlal v26.4s, v12.4h, v14.h[1]
|
||||||
smlal v26.4s, v16.4h, v14.4h[0]
|
smlal v26.4s, v16.4h, v14.h[0]
|
||||||
smull v24.4s, v7.4h, v14.4h[3]
|
smull v24.4s, v7.4h, v14.h[3]
|
||||||
smlal v24.4s, v11.4h, v14.4h[2]
|
smlal v24.4s, v11.4h, v14.h[2]
|
||||||
smlal v24.4s, v13.4h, v14.4h[1]
|
smlal v24.4s, v13.4h, v14.h[1]
|
||||||
smlal v24.4s, v17.4h, v14.4h[0]
|
smlal v24.4s, v17.4h, v14.h[0]
|
||||||
sshll v15.4s, v4.4h, #15
|
sshll v15.4s, v4.4h, #15
|
||||||
sshll v30.4s, v5.4h, #15
|
sshll v30.4s, v5.4h, #15
|
||||||
add v20.4s, v15.4s, v26.4s
|
add v20.4s, v15.4s, v26.4s
|
||||||
@@ -1445,12 +1446,12 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
sub v15.4s, v30.4s, v24.4s
|
sub v15.4s, v30.4s, v24.4s
|
||||||
rshrn v5.4h, v20.4s, #13
|
rshrn v5.4h, v20.4s, #13
|
||||||
rshrn v7.4h, v15.4s, #13
|
rshrn v7.4h, v15.4s, #13
|
||||||
ins v4.2d[1], v5.2d[0]
|
ins v4.d[1], v5.d[0]
|
||||||
ins v6.2d[1], v7.2d[0]
|
ins v6.d[1], v7.d[0]
|
||||||
transpose v4, v6, v3, .16b, .8h
|
transpose v4, v6, v3, .16b, .8h
|
||||||
transpose v6, v10, v3, .16b, .4s
|
transpose v6, v10, v3, .16b, .4s
|
||||||
ins v11.2d[0], v10.2d[1]
|
ins v11.d[0], v10.d[1]
|
||||||
ins v7.2d[0], v6.2d[1]
|
ins v7.d[0], v6.d[1]
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Pass 2 */
|
/* Pass 2 */
|
||||||
@@ -1458,10 +1459,10 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
|
|
||||||
/* Range limit */
|
/* Range limit */
|
||||||
movi v30.8h, #0x80
|
movi v30.8h, #0x80
|
||||||
ins v26.2d[1], v27.2d[0]
|
ins v26.d[1], v27.d[0]
|
||||||
add v26.8h, v26.8h, v30.8h
|
add v26.8h, v26.8h, v30.8h
|
||||||
sqxtun v30.8b, v26.8h
|
sqxtun v30.8b, v26.8h
|
||||||
ins v26.2d[0], v30.2d[0]
|
ins v26.d[0], v30.d[0]
|
||||||
sqxtun v27.8b, v26.8h
|
sqxtun v27.8b, v26.8h
|
||||||
|
|
||||||
/* Store results to the output buffer */
|
/* Store results to the output buffer */
|
||||||
@@ -1476,13 +1477,13 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
|
|
||||||
sub sp, sp, #208
|
sub sp, sp, #208
|
||||||
ldr x15, [sp], 16
|
ldr x15, [sp], 16
|
||||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
ld1 {v21.8b - v22.8b}, [sp], 16
|
ld1 {v21.8b, v22.8b}, [sp], 16
|
||||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
ld1 {v30.8b - v31.8b}, [sp], 16
|
ld1 {v30.8b, v31.8b}, [sp], 16
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
.unreq DCT_TABLE
|
.unreq DCT_TABLE
|
||||||
@@ -1514,9 +1515,9 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
ld1 {v4.8b}, [U], 8
|
ld1 {v4.8b}, [U], 8
|
||||||
ld1 {v5.8b}, [V], 8
|
ld1 {v5.8b}, [V], 8
|
||||||
ld1 {v0.8b}, [Y], 8
|
ld1 {v0.8b}, [Y], 8
|
||||||
prfm PLDL1KEEP, [U, #64]
|
prfm pldl1keep, [U, #64]
|
||||||
prfm PLDL1KEEP, [V, #64]
|
prfm pldl1keep, [V, #64]
|
||||||
prfm PLDL1KEEP, [Y, #64]
|
prfm pldl1keep, [Y, #64]
|
||||||
.elseif \size == 4
|
.elseif \size == 4
|
||||||
ld1 {v4.b}[0], [U], 1
|
ld1 {v4.b}[0], [U], 1
|
||||||
ld1 {v4.b}[1], [U], 1
|
ld1 {v4.b}[1], [U], 1
|
||||||
@@ -1606,14 +1607,14 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
.macro do_yuv_to_rgb_stage1
|
.macro do_yuv_to_rgb_stage1
|
||||||
uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
|
uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
|
||||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||||
smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
|
smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
|
||||||
smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
|
smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro do_yuv_to_rgb_stage2
|
.macro do_yuv_to_rgb_stage2
|
||||||
@@ -1656,18 +1657,18 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
sqxtun v1\g_offs\defsize, v20.8h
|
sqxtun v1\g_offs\defsize, v20.8h
|
||||||
ld1 {v0.8b}, [Y], 8
|
ld1 {v0.8b}, [Y], 8
|
||||||
sqxtun v1\r_offs\defsize, v24.8h
|
sqxtun v1\r_offs\defsize, v24.8h
|
||||||
prfm PLDL1KEEP, [U, #64]
|
prfm pldl1keep, [U, #64]
|
||||||
prfm PLDL1KEEP, [V, #64]
|
prfm pldl1keep, [V, #64]
|
||||||
prfm PLDL1KEEP, [Y, #64]
|
prfm pldl1keep, [Y, #64]
|
||||||
sqxtun v1\b_offs\defsize, v28.8h
|
sqxtun v1\b_offs\defsize, v28.8h
|
||||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||||
.else /**************************** rgb565 ***********************************/
|
.else /**************************** rgb565 ***********************************/
|
||||||
sqshlu v21.8h, v20.8h, #8
|
sqshlu v21.8h, v20.8h, #8
|
||||||
sqshlu v25.8h, v24.8h, #8
|
sqshlu v25.8h, v24.8h, #8
|
||||||
@@ -1675,21 +1676,21 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||||
ld1 {v0.8b}, [Y], 8
|
ld1 {v0.8b}, [Y], 8
|
||||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||||
sri v25.8h, v21.8h, #5
|
sri v25.8h, v21.8h, #5
|
||||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||||
prfm PLDL1KEEP, [U, #64]
|
prfm pldl1keep, [U, #64]
|
||||||
prfm PLDL1KEEP, [V, #64]
|
prfm pldl1keep, [V, #64]
|
||||||
prfm PLDL1KEEP, [Y, #64]
|
prfm pldl1keep, [Y, #64]
|
||||||
sri v25.8h, v29.8h, #11
|
sri v25.8h, v29.8h, #11
|
||||||
.endif
|
.endif
|
||||||
do_store \bpp, 8
|
do_store \bpp, 8
|
||||||
smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
|
smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
|
||||||
smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
|
smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro do_yuv_to_rgb
|
.macro do_yuv_to_rgb
|
||||||
@@ -1702,7 +1703,7 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
.balign 16
|
.balign 16
|
||||||
jsimd_ycc_\colorid\()_neon_consts:
|
Ljsimd_ycc_\colorid\()_neon_consts:
|
||||||
.short 0, 0, 0, 0
|
.short 0, 0, 0, 0
|
||||||
.short 22971, -11277, -23401, 29033
|
.short 22971, -11277, -23401, 29033
|
||||||
.short -128, -128, -128, -128
|
.short -128, -128, -128, -128
|
||||||
@@ -1717,7 +1718,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
|
|
||||||
INPUT_BUF0 .req x5
|
INPUT_BUF0 .req x5
|
||||||
INPUT_BUF1 .req x6
|
INPUT_BUF1 .req x6
|
||||||
INPUT_BUF2 .req INPUT_BUF
|
INPUT_BUF2 .req x1
|
||||||
|
|
||||||
RGB .req x7
|
RGB .req x7
|
||||||
Y .req x8
|
Y .req x8
|
||||||
@@ -1728,16 +1729,16 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
sub sp, sp, 336
|
sub sp, sp, 336
|
||||||
str x15, [sp], 16
|
str x15, [sp], 16
|
||||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||||
adr x15, jsimd_ycc_\colorid\()_neon_consts
|
adr x15, Ljsimd_ycc_\colorid\()_neon_consts
|
||||||
/* Save NEON registers */
|
/* Save NEON registers */
|
||||||
st1 {v0.8b - v3.8b}, [sp], 32
|
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
st1 {v4.8b - v7.8b}, [sp], 32
|
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
st1 {v8.8b - v11.8b}, [sp], 32
|
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
st1 {v12.8b - v15.8b}, [sp], 32
|
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
st1 {v16.8b - v19.8b}, [sp], 32
|
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
st1 {v20.8b - v23.8b}, [sp], 32
|
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||||
st1 {v24.8b - v27.8b}, [sp], 32
|
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
st1 {v28.8b - v31.8b}, [sp], 32
|
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||||
ld1 {v0.4h, v1.4h}, [x15], 16
|
ld1 {v0.4h, v1.4h}, [x15], 16
|
||||||
ld1 {v2.8h}, [x15]
|
ld1 {v2.8h}, [x15]
|
||||||
|
|
||||||
@@ -1748,8 +1749,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
stp x8, x9, [sp], 16
|
stp x8, x9, [sp], 16
|
||||||
stp x10, x30, [sp], 16
|
stp x10, x30, [sp], 16
|
||||||
ldr INPUT_BUF0, [INPUT_BUF]
|
ldr INPUT_BUF0, [INPUT_BUF]
|
||||||
ldr INPUT_BUF1, [INPUT_BUF, 8]
|
ldr INPUT_BUF1, [INPUT_BUF, #8]
|
||||||
ldr INPUT_BUF2, [INPUT_BUF, 16]
|
ldr INPUT_BUF2, [INPUT_BUF, #16]
|
||||||
.unreq INPUT_BUF
|
.unreq INPUT_BUF
|
||||||
|
|
||||||
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
|
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
|
||||||
@@ -1758,7 +1759,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
|
|
||||||
/* Outer loop over scanlines */
|
/* Outer loop over scanlines */
|
||||||
cmp NUM_ROWS, #1
|
cmp NUM_ROWS, #1
|
||||||
blt 9f
|
b.lt 9f
|
||||||
0:
|
0:
|
||||||
lsl x16, INPUT_ROW, #3
|
lsl x16, INPUT_ROW, #3
|
||||||
ldr Y, [INPUT_BUF0, x16]
|
ldr Y, [INPUT_BUF0, x16]
|
||||||
@@ -1770,60 +1771,60 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
|
|
||||||
/* Inner loop over pixels */
|
/* Inner loop over pixels */
|
||||||
subs N, N, #8
|
subs N, N, #8
|
||||||
blt 3f
|
b.lt 3f
|
||||||
do_load 8
|
do_load 8
|
||||||
do_yuv_to_rgb_stage1
|
do_yuv_to_rgb_stage1
|
||||||
subs N, N, #8
|
subs N, N, #8
|
||||||
blt 2f
|
b.lt 2f
|
||||||
1:
|
1:
|
||||||
do_yuv_to_rgb_stage2_store_load_stage1
|
do_yuv_to_rgb_stage2_store_load_stage1
|
||||||
subs N, N, #8
|
subs N, N, #8
|
||||||
bge 1b
|
b.ge 1b
|
||||||
2:
|
2:
|
||||||
do_yuv_to_rgb_stage2
|
do_yuv_to_rgb_stage2
|
||||||
do_store \bpp, 8
|
do_store \bpp, 8
|
||||||
tst N, #7
|
tst N, #7
|
||||||
beq 8f
|
b.eq 8f
|
||||||
3:
|
3:
|
||||||
tst N, #4
|
tst N, #4
|
||||||
beq 3f
|
b.eq 3f
|
||||||
do_load 4
|
do_load 4
|
||||||
3:
|
3:
|
||||||
tst N, #2
|
tst N, #2
|
||||||
beq 4f
|
b.eq 4f
|
||||||
do_load 2
|
do_load 2
|
||||||
4:
|
4:
|
||||||
tst N, #1
|
tst N, #1
|
||||||
beq 5f
|
b.eq 5f
|
||||||
do_load 1
|
do_load 1
|
||||||
5:
|
5:
|
||||||
do_yuv_to_rgb
|
do_yuv_to_rgb
|
||||||
tst N, #4
|
tst N, #4
|
||||||
beq 6f
|
b.eq 6f
|
||||||
do_store \bpp, 4
|
do_store \bpp, 4
|
||||||
6:
|
6:
|
||||||
tst N, #2
|
tst N, #2
|
||||||
beq 7f
|
b.eq 7f
|
||||||
do_store \bpp, 2
|
do_store \bpp, 2
|
||||||
7:
|
7:
|
||||||
tst N, #1
|
tst N, #1
|
||||||
beq 8f
|
b.eq 8f
|
||||||
do_store \bpp, 1
|
do_store \bpp, 1
|
||||||
8:
|
8:
|
||||||
subs NUM_ROWS, NUM_ROWS, #1
|
subs NUM_ROWS, NUM_ROWS, #1
|
||||||
bgt 0b
|
b.gt 0b
|
||||||
9:
|
9:
|
||||||
/* Restore all registers and return */
|
/* Restore all registers and return */
|
||||||
sub sp, sp, #336
|
sub sp, sp, #336
|
||||||
ldr x15, [sp], 16
|
ldr x15, [sp], 16
|
||||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||||
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
||||||
ldp x4, x5, [sp], 16
|
ldp x4, x5, [sp], 16
|
||||||
ldp x6, x7, [sp], 16
|
ldp x6, x7, [sp], 16
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* jsimd_powerpc64.c
|
* jsimd_powerpc.c
|
||||||
*
|
*
|
||||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||||
* Copyright 2009-2011, 2014 D. R. Commander
|
* Copyright 2009-2011, 2014 D. R. Commander
|
||||||
@@ -42,12 +42,38 @@ init_simd (void)
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_rgb_ycc (void)
|
jsimd_can_rgb_ycc (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_rgb_gray (void)
|
jsimd_can_rgb_gray (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -68,6 +94,37 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
|||||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
JDIMENSION output_row, int num_rows)
|
JDIMENSION output_row, int num_rows)
|
||||||
{
|
{
|
||||||
|
void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||||
|
|
||||||
|
switch(cinfo->in_color_space) {
|
||||||
|
case JCS_EXT_RGB:
|
||||||
|
altivecfct=jsimd_extrgb_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_RGBX:
|
||||||
|
case JCS_EXT_RGBA:
|
||||||
|
altivecfct=jsimd_extrgbx_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGR:
|
||||||
|
altivecfct=jsimd_extbgr_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGRX:
|
||||||
|
case JCS_EXT_BGRA:
|
||||||
|
altivecfct=jsimd_extbgrx_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XBGR:
|
||||||
|
case JCS_EXT_ABGR:
|
||||||
|
altivecfct=jsimd_extxbgr_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XRGB:
|
||||||
|
case JCS_EXT_ARGB:
|
||||||
|
altivecfct=jsimd_extxrgb_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
altivecfct=jsimd_rgb_ycc_convert_altivec;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
@@ -75,6 +132,37 @@ jsimd_rgb_gray_convert (j_compress_ptr cinfo,
|
|||||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||||
JDIMENSION output_row, int num_rows)
|
JDIMENSION output_row, int num_rows)
|
||||||
{
|
{
|
||||||
|
void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||||
|
|
||||||
|
switch(cinfo->in_color_space) {
|
||||||
|
case JCS_EXT_RGB:
|
||||||
|
altivecfct=jsimd_extrgb_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_RGBX:
|
||||||
|
case JCS_EXT_RGBA:
|
||||||
|
altivecfct=jsimd_extrgbx_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGR:
|
||||||
|
altivecfct=jsimd_extbgr_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_BGRX:
|
||||||
|
case JCS_EXT_BGRA:
|
||||||
|
altivecfct=jsimd_extbgrx_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XBGR:
|
||||||
|
case JCS_EXT_ABGR:
|
||||||
|
altivecfct=jsimd_extxbgr_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
case JCS_EXT_XRGB:
|
||||||
|
case JCS_EXT_ARGB:
|
||||||
|
altivecfct=jsimd_extxrgb_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
altivecfct=jsimd_rgb_gray_convert_altivec;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
@@ -202,6 +290,21 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_convsamp (void)
|
jsimd_can_convsamp (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (DCTSIZE != 8)
|
||||||
|
return 0;
|
||||||
|
if (BITS_IN_JSAMPLE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JDIMENSION) != 4)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(DCTELEM) != 2)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,6 +318,7 @@ GLOBAL(void)
|
|||||||
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||||
DCTELEM * workspace)
|
DCTELEM * workspace)
|
||||||
{
|
{
|
||||||
|
jsimd_convsamp_altivec(sample_data, start_col, workspace);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
@@ -226,6 +330,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_fdct_islow (void)
|
jsimd_can_fdct_islow (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (DCTSIZE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(DCTELEM) != 2)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -255,6 +370,7 @@ jsimd_can_fdct_float (void)
|
|||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
jsimd_fdct_islow (DCTELEM * data)
|
jsimd_fdct_islow (DCTELEM * data)
|
||||||
{
|
{
|
||||||
|
jsimd_fdct_islow_altivec(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
@@ -271,6 +387,19 @@ jsimd_fdct_float (FAST_FLOAT * data)
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_quantize (void)
|
jsimd_can_quantize (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (DCTSIZE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JCOEF) != 2)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(DCTELEM) != 2)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -284,6 +413,7 @@ GLOBAL(void)
|
|||||||
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||||
DCTELEM * workspace)
|
DCTELEM * workspace)
|
||||||
{
|
{
|
||||||
|
jsimd_quantize_altivec(coef_block, divisors, workspace);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
@@ -321,12 +451,34 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_idct_islow (void)
|
jsimd_can_idct_islow (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (DCTSIZE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JCOEF) != 2)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(int)
|
GLOBAL(int)
|
||||||
jsimd_can_idct_ifast (void)
|
jsimd_can_idct_ifast (void)
|
||||||
{
|
{
|
||||||
|
init_simd();
|
||||||
|
|
||||||
|
/* The code is optimised for these values only */
|
||||||
|
if (DCTSIZE != 8)
|
||||||
|
return 0;
|
||||||
|
if (sizeof(JCOEF) != 2)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (simd_support & JSIMD_ALTIVEC)
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -341,6 +493,8 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||||
JDIMENSION output_col)
|
JDIMENSION output_col)
|
||||||
{
|
{
|
||||||
|
jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
|
||||||
|
output_col);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
@@ -348,6 +502,8 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||||
JDIMENSION output_col)
|
JDIMENSION output_col)
|
||||||
{
|
{
|
||||||
|
jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
|
||||||
|
output_col);
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL(void)
|
GLOBAL(void)
|
||||||
|
|||||||
@@ -50,4 +50,7 @@ TURBOJPEG_1.4
|
|||||||
tjDecompressToYUVPlanes;
|
tjDecompressToYUVPlanes;
|
||||||
tjEncodeYUV3;
|
tjEncodeYUV3;
|
||||||
tjEncodeYUVPlanes;
|
tjEncodeYUVPlanes;
|
||||||
|
tjPlaneHeight;
|
||||||
|
tjPlaneSizeYUV;
|
||||||
|
tjPlaneWidth;
|
||||||
} TURBOJPEG_1.2;
|
} TURBOJPEG_1.2;
|
||||||
|
|||||||
@@ -76,6 +76,9 @@ TURBOJPEG_1.4
|
|||||||
tjDecompressToYUVPlanes;
|
tjDecompressToYUVPlanes;
|
||||||
tjEncodeYUV3;
|
tjEncodeYUV3;
|
||||||
tjEncodeYUVPlanes;
|
tjEncodeYUVPlanes;
|
||||||
|
tjPlaneHeight;
|
||||||
|
tjPlaneSizeYUV;
|
||||||
|
tjPlaneWidth;
|
||||||
Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
|
Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
|
||||||
Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
|
Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
|
||||||
Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;
|
Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;
|
||||||
|
|||||||
@@ -320,6 +320,14 @@ static int setDecompDefaults(struct jpeg_decompress_struct *dinfo,
|
|||||||
static int getSubsamp(j_decompress_ptr dinfo)
|
static int getSubsamp(j_decompress_ptr dinfo)
|
||||||
{
|
{
|
||||||
int retval=-1, i, k;
|
int retval=-1, i, k;
|
||||||
|
|
||||||
|
/* The sampling factors actually have no meaning with grayscale JPEG files,
|
||||||
|
and in fact it's possible to generate grayscale JPEGs with sampling
|
||||||
|
factors > 1 (even though those sampling factors are ignored by the
|
||||||
|
decompressor.) Thus, we need to treat grayscale as a special case. */
|
||||||
|
if(dinfo->num_components==1 && dinfo->jpeg_color_space==JCS_GRAYSCALE)
|
||||||
|
return TJSAMP_GRAY;
|
||||||
|
|
||||||
for(i=0; i<NUMSUBOPT; i++)
|
for(i=0; i<NUMSUBOPT; i++)
|
||||||
{
|
{
|
||||||
if(dinfo->num_components==pixelsize[i]
|
if(dinfo->num_components==pixelsize[i]
|
||||||
|
|||||||
Reference in New Issue
Block a user