Merge tag '1.5.1'
Tag 1.5.1 release

* tag '1.5.1':
  ARM64 NEON: Fix another ABI conformance issue
  Build: Remove ARMv6 support from 'make iosdmg'
  Fix out-of-bounds write in partial decomp. feature
  Silence additional UBSan warnings
  Fix unsigned int overflow in libjpeg memory mgr.
  TurboJPEG: Decomp. 4:2:2/4:4:0 JPEGs w/unusual SFs
  Silence pedantic GCC6 code formatting warnings
  Use plain upsampling if merged isn't accelerated
  Implement h1v2 fancy upsampling
  Fix AArch64 ABI conformance issue in SIMD code
  Don't install libturbojpeg.pc if TJPEG disabled
  Linux/PPC: Only enable AltiVec if CPU supports it
  ARM/MIPS: Change the behavior of JSIMD_FORCE*
  Bump version to 1.5.1 to prepare for new commits
This commit is contained in:
34
BUILDING.md
34
BUILDING.md
@@ -323,11 +323,6 @@ Set the following shell variables for simplicity:
|
||||
IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
|
||||
IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
|
||||
|
||||
*ARMv6 (code will run on all iOS devices, not SIMD-accelerated)*
|
||||
[NOTE: Requires Xcode 4.4.x or earlier]
|
||||
|
||||
IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
|
||||
|
||||
*ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer)*
|
||||
|
||||
IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
|
||||
@@ -399,8 +394,8 @@ NOTE: You can also add `-miphoneos-version-min={version}` to `$IOS_CFLAGS`
|
||||
above in order to support older versions of iOS than the default version
|
||||
supported by the SDK.
|
||||
|
||||
Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
|
||||
into a universal library.
|
||||
Once built, lipo can be used to combine the ARMv7, v7s, and/or v8 variants into
|
||||
a universal library.
|
||||
|
||||
|
||||
### Building libjpeg-turbo for Android
|
||||
@@ -782,7 +777,6 @@ default, but you can override this by setting the `BUILDDIR32` variable on the
|
||||
make command line as shown above.
|
||||
|
||||
make iosdmg [BUILDDIR32={32-bit build directory}] \
|
||||
[BUILDDIRARMV6={ARMv6 build directory}] \
|
||||
[BUILDDIRARMV7={ARMv7 build directory}] \
|
||||
[BUILDDIRARMV7S={ARMv7s build directory}] \
|
||||
[BUILDDIRARMV8={ARMv8 build directory}]
|
||||
@@ -791,19 +785,17 @@ On OS X systems, this creates a Macintosh package and disk image in which the
|
||||
libjpeg-turbo static libraries contain ARM architectures necessary to build
|
||||
iOS applications. If building on an x86-64 system, the binaries will also
|
||||
contain the i386 architecture, as with `make udmg` above. You should first
|
||||
configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
|
||||
libjpeg-turbo (see "Building libjpeg-turbo for iOS" above.) If you are
|
||||
building an x86-64 version of libjpeg-turbo, you should configure a 32-bit
|
||||
out-of-tree build as well. Next, build libjpeg-turbo as you would normally,
|
||||
using an out-of-tree build. When it is built, run `make iosdmg` from the
|
||||
build directory. The build system will look for the ARMv6 build under
|
||||
*{source_directory}*/iosarmv6 by default, the ARMv7 build under
|
||||
*{source_directory}*/iosarmv7 by default, the ARMv7s build under
|
||||
*{source_directory}*/iosarmv7s by default, the ARMv8 build under
|
||||
*{source_directory}*/iosarmv8 by default, and (if applicable) the 32-bit build
|
||||
under *{source_directory}*/osxx86 by default, but you can override this by
|
||||
setting the `BUILDDIR32`, `BUILDDIRARMV6`, `BUILDDIRARMV7`, `BUILDDIRARMV7S`,
|
||||
and/or `BUILDDIRARMV8` variables on the `make` command line as shown above.
|
||||
configure ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of libjpeg-turbo (see
|
||||
"Building libjpeg-turbo for iOS" above.) If you are building an x86-64 version
|
||||
of libjpeg-turbo, you should configure a 32-bit out-of-tree build as well.
|
||||
Next, build libjpeg-turbo as you would normally, using an out-of-tree build.
|
||||
When it is built, run `make iosdmg` from the build directory. The build system
|
||||
will look for the ARMv7 build under *{source_directory}*/iosarmv7 by default,
|
||||
the ARMv7s build under *{source_directory}*/iosarmv7s by default, the ARMv8
|
||||
build under *{source_directory}*/iosarmv8 by default, and (if applicable) the
|
||||
32-bit build under *{source_directory}*/osxx86 by default, but you can override
|
||||
this by setting the `BUILDDIR32`, `BUILDDIRARMV7`, `BUILDDIRARMV7S`, and/or
|
||||
`BUILDDIRARMV8` variables on the `make` command line as shown above.
|
||||
|
||||
NOTE: If including an ARMv8 build in the package, then you may need to use
|
||||
Xcode's version of lipo instead of the operating system's. To do this, pass
|
||||
|
||||
@@ -9,7 +9,7 @@ if(POLICY CMP0022)
|
||||
endif()
|
||||
|
||||
project(mozjpeg C)
|
||||
set(VERSION 3.2)
|
||||
set(VERSION 3.3)
|
||||
|
||||
if(NOT WIN32)
|
||||
message(FATAL_ERROR "Platform not supported by this build system. Use autotools instead.")
|
||||
|
||||
91
ChangeLog.md
91
ChangeLog.md
@@ -1,3 +1,94 @@
|
||||
1.5.1
|
||||
=====
|
||||
|
||||
### Significant changes relative to 1.5.0:
|
||||
|
||||
1. Previously, the undocumented `JSIMD_FORCE*` environment variables could be
|
||||
used to force-enable a particular SIMD instruction set if multiple instruction
|
||||
sets were available on a particular platform. On x86 platforms, where CPU
|
||||
feature detection is bulletproof and multiple SIMD instruction sets are
|
||||
available, it makes sense for those environment variables to allow forcing the
|
||||
use of an instruction set only if that instruction set is available. However,
|
||||
since the ARM implementations of libjpeg-turbo can only use one SIMD
|
||||
instruction set, and since their feature detection code is less bulletproof
|
||||
(parsing /proc/cpuinfo), it makes sense for the `JSIMD_FORCENEON` environment
|
||||
variable to bypass the feature detection code and really force the use of NEON
|
||||
instructions. A new environment variable (`JSIMD_FORCEDSPR2`) was introduced
|
||||
in the MIPS implementation for the same reasons, and the existing
|
||||
`JSIMD_FORCENONE` environment variable was extended to that implementation.
|
||||
These environment variables provide a workaround for those attempting to test
|
||||
ARM and MIPS builds of libjpeg-turbo in QEMU, which passes through
|
||||
/proc/cpuinfo from the host system.
|
||||
|
||||
2. libjpeg-turbo previously assumed that AltiVec instructions were always
|
||||
available on PowerPC platforms, which led to "illegal instruction" errors when
|
||||
running on PowerPC chips that lack AltiVec support (such as the older 7xx/G3
|
||||
and newer e5500 series.) libjpeg-turbo now examines /proc/cpuinfo on
|
||||
Linux/Android systems and enables AltiVec instructions only if the CPU supports
|
||||
them. It also now provides two environment variables, `JSIMD_FORCEALTIVEC` and
|
||||
`JSIMD_FORCENONE`, to force-enable and force-disable AltiVec instructions in
|
||||
environments where /proc/cpuinfo is an unreliable means of CPU feature
|
||||
detection (such as when running in QEMU.) On OS X, libjpeg-turbo continues to
|
||||
assume that AltiVec support is always available, which means that libjpeg-turbo
|
||||
cannot be used with G3 Macs unless you set the environment variable
|
||||
`JSIMD_FORCENONE` to `1`.
|
||||
|
||||
3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would
|
||||
crash when built with recent releases of the Clang/LLVM compiler. This was
|
||||
caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD
|
||||
routines. Those routines were incorrectly using 64-bit instructions to
|
||||
transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper
|
||||
(unused) 32 bits of a 32-bit argument's register to be undefined. The new
|
||||
Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
|
||||
structure members into a single 64-bit register, and this exposed the ABI
|
||||
conformance issue.
|
||||
|
||||
4. Fancy upsampling is now supported when decompressing JPEG images that use
|
||||
4:4:0 (h1v2) chroma subsampling. These images are generated when losslessly
|
||||
rotating or transposing JPEG images that use 4:2:2 (h2v1) chroma subsampling.
|
||||
The h1v2 fancy upsampling algorithm is not currently SIMD-accelerated.
|
||||
|
||||
5. If merged upsampling isn't SIMD-accelerated but YCbCr-to-RGB conversion is,
|
||||
then libjpeg-turbo will now disable merged upsampling when decompressing YCbCr
|
||||
JPEG images into RGB or extended RGB output images. This significantly speeds
|
||||
up the decompression of 4:2:0 and 4:2:2 JPEGs on ARM platforms if fancy
|
||||
upsampling is not used (for example, if the `-nosmooth` option to djpeg is
|
||||
specified.)
|
||||
|
||||
6. The TurboJPEG API will now decompress 4:2:2 and 4:4:0 JPEG images with
|
||||
2x2 luminance sampling factors and 2x1 or 1x2 chrominance sampling factors.
|
||||
This is a non-standard way of specifying 2x subsampling (normally 4:2:2 JPEGs
|
||||
have 2x1 luminance and 1x1 chrominance sampling factors, and 4:4:0 JPEGs have
|
||||
1x2 luminance and 1x1 chrominance sampling factors), but the JPEG specification
|
||||
and the libjpeg API both allow it.
|
||||
|
||||
7. Fixed an unsigned integer overflow in the libjpeg memory manager, detected
|
||||
by the Clang undefined behavior sanitizer, that could be triggered by
|
||||
attempting to decompress a specially-crafted malformed JPEG image. This issue
|
||||
affected only 32-bit code and did not pose a security threat, but removing the
|
||||
warning makes it easier to detect actual security issues, should they arise in
|
||||
the future.
|
||||
|
||||
8. Fixed additional negative left shifts and other issues reported by the GCC
|
||||
and Clang undefined behavior sanitizers when attempting to decompress
|
||||
specially-crafted malformed JPEG images. None of these issues posed a security
|
||||
threat, but removing the warnings makes it easier to detect actual security
|
||||
issues, should they arise in the future.
|
||||
|
||||
9. Fixed an out-of-bounds array reference, introduced by 1.4.90[2] (partial
|
||||
image decompression) and detected by the Clang undefined behavior sanitizer,
|
||||
that could be triggered by a specially-crafted malformed JPEG image with more
|
||||
than four components. Because the out-of-bounds reference was still within the
|
||||
same structure, it was not known to pose a security threat, but removing the
|
||||
warning makes it easier to detect actual security issues, should they arise in
|
||||
the future.
|
||||
|
||||
10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
|
||||
code. Some of the routines were incorrectly reading and storing data below the
|
||||
stack pointer, which caused segfaults in certain applications under specific
|
||||
circumstances.
|
||||
|
||||
|
||||
1.5.0
|
||||
=====
|
||||
|
||||
|
||||
@@ -11,7 +11,10 @@ endif
|
||||
nodist_include_HEADERS = jconfig.h
|
||||
|
||||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
pkgconfig_DATA = pkgscripts/libjpeg.pc pkgscripts/libturbojpeg.pc
|
||||
pkgconfig_DATA = pkgscripts/libjpeg.pc
|
||||
if WITH_TURBOJPEG
|
||||
pkgconfig_DATA += pkgscripts/libturbojpeg.pc
|
||||
endif
|
||||
|
||||
HDRS = jchuff.h jcmaster.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h \
|
||||
jmorecfg.h jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
|
||||
@@ -771,12 +774,12 @@ udmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
|
||||
|
||||
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||
sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||
|
||||
else
|
||||
|
||||
iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
|
||||
sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||
sh pkgscripts/makemacpkg -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
|
||||
|
||||
endif
|
||||
|
||||
|
||||
21
bmp.c
21
bmp.c
@@ -108,10 +108,14 @@ static void pixelconvert(unsigned char *srcbuf, int srcpf, int srcbottomup,
|
||||
m=(m-k)/(1.0-k);
|
||||
y=(y-k)/(1.0-k);
|
||||
}
|
||||
if(c>1.0) c=1.0; if(c<0.) c=0.;
|
||||
if(m>1.0) m=1.0; if(m<0.) m=0.;
|
||||
if(y>1.0) y=1.0; if(y<0.) y=0.;
|
||||
if(k>1.0) k=1.0; if(k<0.) k=0.;
|
||||
if(c>1.0) c=1.0;
|
||||
if(c<0.) c=0.;
|
||||
if(m>1.0) m=1.0;
|
||||
if(m<0.) m=0.;
|
||||
if(y>1.0) y=1.0;
|
||||
if(y<0.) y=0.;
|
||||
if(k>1.0) k=1.0;
|
||||
if(k<0.) k=0.;
|
||||
*dstcolptr++=(unsigned char)(255.0-c*255.0+0.5);
|
||||
*dstcolptr++=(unsigned char)(255.0-m*255.0+0.5);
|
||||
*dstcolptr++=(unsigned char)(255.0-y*255.0+0.5);
|
||||
@@ -133,9 +137,12 @@ static void pixelconvert(unsigned char *srcbuf, int srcpf, int srcbottomup,
|
||||
double r=c*k/255.;
|
||||
double g=m*k/255.;
|
||||
double b=y*k/255.;
|
||||
if(r>255.0) r=255.0; if(r<0.) r=0.;
|
||||
if(g>255.0) g=255.0; if(g<0.) g=0.;
|
||||
if(b>255.0) b=255.0; if(b<0.) b=0.;
|
||||
if(r>255.0) r=255.0;
|
||||
if(r<0.) r=0.;
|
||||
if(g>255.0) g=255.0;
|
||||
if(g<0.) g=0.;
|
||||
if(b>255.0) b=255.0;
|
||||
if(b<0.) b=0.;
|
||||
dstcolptr[tjRedOffset[dstpf]]=(unsigned char)(r+0.5);
|
||||
dstcolptr[tjGreenOffset[dstpf]]=(unsigned char)(g+0.5);
|
||||
dstcolptr[tjBlueOffset[dstpf]]=(unsigned char)(b+0.5);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Process this file with autoconf to produce a configure script.
|
||||
|
||||
AC_PREREQ([2.56])
|
||||
AC_INIT([mozjpeg], [3.2])
|
||||
AC_INIT([mozjpeg], [3.3])
|
||||
BUILD=`date +%Y%m%d`
|
||||
|
||||
AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* This file was part of the Independent JPEG Group's software:
|
||||
* Developed 1997-2015 by Guido Vollbeding.
|
||||
* libjpeg-turbo Modifications:
|
||||
* Copyright (C) 2015, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, D. R. Commander.
|
||||
* For conditions of distribution and use, see the accompanying README.ijg
|
||||
* file.
|
||||
*
|
||||
@@ -382,7 +382,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
if (arith_decode(cinfo, st)) v |= m;
|
||||
v += 1; if (sign) v = -v;
|
||||
/* Scale and output coefficient in natural (dezigzagged) order */
|
||||
(*block)[jpeg_natural_order[k]] = (JCOEF) (v << cinfo->Al);
|
||||
(*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al);
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
||||
4
jdhuff.c
4
jdhuff.c
@@ -109,9 +109,9 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
|
||||
actbl = compptr->ac_tbl_no;
|
||||
/* Compute derived values for Huffman tables */
|
||||
/* We may do this more than once for a table, but it's not expensive */
|
||||
pdtbl = entropy->dc_derived_tbls + dctbl;
|
||||
pdtbl = (d_derived_tbl **)(entropy->dc_derived_tbls) + dctbl;
|
||||
jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, pdtbl);
|
||||
pdtbl = entropy->ac_derived_tbls + actbl;
|
||||
pdtbl = (d_derived_tbl **)(entropy->ac_derived_tbls) + actbl;
|
||||
jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, pdtbl);
|
||||
/* Initialize DC predictions to 0 */
|
||||
entropy->saved.last_dc_val[ci] = 0;
|
||||
|
||||
12
jdmaster.c
12
jdmaster.c
@@ -22,6 +22,7 @@
|
||||
#include "jpeglib.h"
|
||||
#include "jpegcomp.h"
|
||||
#include "jdmaster.h"
|
||||
#include "jsimd.h"
|
||||
|
||||
|
||||
/*
|
||||
@@ -69,6 +70,17 @@ use_merged_upsample (j_decompress_ptr cinfo)
|
||||
cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
|
||||
cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
|
||||
return FALSE;
|
||||
#ifdef WITH_SIMD
|
||||
/* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
|
||||
isn't, then disabling merged upsampling is likely to be faster when
|
||||
decompressing YCbCr JPEG images. */
|
||||
if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
|
||||
jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
|
||||
(cinfo->out_color_space == JCS_RGB ||
|
||||
(cinfo->out_color_space >= JCS_EXT_RGB &&
|
||||
cinfo->out_color_space <= JCS_EXT_ARGB)))
|
||||
return FALSE;
|
||||
#endif
|
||||
/* ??? also need to test for upsample-time rescaling, when & if supported */
|
||||
return TRUE; /* by golly, it'll work... */
|
||||
#else
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* This file was part of the Independent JPEG Group's software:
|
||||
* Copyright (C) 1995-1997, Thomas G. Lane.
|
||||
* libjpeg-turbo Modifications:
|
||||
* Copyright (C) 2015, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, D. R. Commander.
|
||||
* For conditions of distribution and use, see the accompanying README.ijg
|
||||
* file.
|
||||
*
|
||||
@@ -170,12 +170,12 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
|
||||
if (is_DC_band) {
|
||||
if (cinfo->Ah == 0) { /* DC refinement needs no table */
|
||||
tbl = compptr->dc_tbl_no;
|
||||
pdtbl = entropy->derived_tbls + tbl;
|
||||
pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl;
|
||||
jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, pdtbl);
|
||||
}
|
||||
} else {
|
||||
tbl = compptr->ac_tbl_no;
|
||||
pdtbl = entropy->derived_tbls + tbl;
|
||||
pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl;
|
||||
jpeg_make_d_derived_tbl(cinfo, FALSE, tbl, pdtbl);
|
||||
/* remember the single active table */
|
||||
entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
|
||||
|
||||
47
jdsample.c
47
jdsample.c
@@ -303,6 +303,48 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Fancy processing for 1:1 horizontal and 2:1 vertical (4:4:0 subsampling).
|
||||
*
|
||||
* This is a less common case, but it can be encountered when losslessly
|
||||
* rotating/transposing a JPEG file that uses 4:2:2 chroma subsampling.
|
||||
*/
|
||||
|
||||
METHODDEF(void) /* h1v2 fancy upsampling: writes two output rows per input row, each a 3:1 blend of two adjacent input rows */
|
||||
h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr; /* row array that receives the upsampled samples */
|
||||
JSAMPROW inptr0, inptr1, outptr;
|
||||
#if BITS_IN_JSAMPLE == 8
|
||||
int thiscolsum; /* weighted sum of the two input samples in the current column */
|
||||
|
||||
#else
|
||||
|
||||
JLONG thiscolsum; /* same weighted sum, held in a JLONG when samples are not 8-bit */
|
||||
|
||||
#endif
|
||||
|
||||
JDIMENSION colctr;
|
||||
int inrow, outrow, v;
|
||||
|
||||
|
||||
inrow = outrow = 0;
|
||||
while (outrow < cinfo->max_v_samp_factor) { /* each pass consumes one input row and emits two output rows */
|
||||
for (v = 0; v < 2; v++) { /* v == 0: upper output row, v == 1: lower output row */
|
||||
/* inptr0 points to nearest input row, inptr1 points to next nearest */
|
||||
inptr0 = input_data[inrow];
|
||||
if (v == 0) /* next nearest is row above */
|
||||
inptr1 = input_data[inrow-1]; /* index is -1 on the first pass; presumably valid because context rows are supplied -- TODO confirm against caller */
|
||||
else /* next nearest is row below */
|
||||
inptr1 = input_data[inrow+1]; /* reads one row past inrow; presumably a context row on the last pass -- TODO confirm */
|
||||
outptr = output_data[outrow++];
|
||||
|
||||
for(colctr = 0; colctr < compptr->downsampled_width; colctr++) { /* one output sample per input column (no horizontal expansion) */
|
||||
thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); /* weight 3 for the nearest row, 1 for the next nearest */
|
||||
*outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2); /* add 1, then divide by 4 via shift */
|
||||
}
|
||||
}
|
||||
inrow++; /* advance to the next input row */
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
* Again a triangle filter; see comments for h2v1 case, above.
|
||||
@@ -431,6 +473,11 @@ jinit_upsampler (j_decompress_ptr cinfo)
|
||||
else
|
||||
upsample->methods[ci] = h2v1_upsample;
|
||||
}
|
||||
} else if (h_in_group == h_out_group &&
|
||||
v_in_group * 2 == v_out_group && do_fancy) {
|
||||
/* Non-fancy upsampling is handled by the generic method */
|
||||
upsample->methods[ci] = h1v2_fancy_upsample;
|
||||
upsample->pub.need_context_rows = TRUE;
|
||||
} else if (h_in_group * 2 == h_out_group &&
|
||||
v_in_group * 2 == v_out_group) {
|
||||
/* Special cases for 2h2v upsampling */
|
||||
|
||||
17
jmemmgr.c
17
jmemmgr.c
@@ -32,6 +32,7 @@
|
||||
#include "jinclude.h"
|
||||
#include "jpeglib.h"
|
||||
#include "jmemsys.h" /* import the system-dependent declarations */
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef NO_GETENV
|
||||
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
|
||||
@@ -650,18 +651,26 @@ realize_virt_arrays (j_common_ptr cinfo)
|
||||
maximum_space = 0;
|
||||
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
|
||||
if (sptr->mem_buffer == NULL) { /* if not realized yet */
|
||||
size_t new_space = (long) sptr->rows_in_array *
|
||||
(long) sptr->samplesperrow * sizeof(JSAMPLE);
|
||||
|
||||
space_per_minheight += (long) sptr->maxaccess *
|
||||
(long) sptr->samplesperrow * sizeof(JSAMPLE);
|
||||
maximum_space += (long) sptr->rows_in_array *
|
||||
(long) sptr->samplesperrow * sizeof(JSAMPLE);
|
||||
if (SIZE_MAX - maximum_space < new_space)
|
||||
out_of_memory(cinfo, 10);
|
||||
maximum_space += new_space;
|
||||
}
|
||||
}
|
||||
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
|
||||
if (bptr->mem_buffer == NULL) { /* if not realized yet */
|
||||
size_t new_space = (long) bptr->rows_in_array *
|
||||
(long) bptr->blocksperrow * sizeof(JBLOCK);
|
||||
|
||||
space_per_minheight += (long) bptr->maxaccess *
|
||||
(long) bptr->blocksperrow * sizeof(JBLOCK);
|
||||
maximum_space += (long) bptr->rows_in_array *
|
||||
(long) bptr->blocksperrow * sizeof(JBLOCK);
|
||||
if (SIZE_MAX - maximum_space < new_space)
|
||||
out_of_memory(cinfo, 11);
|
||||
maximum_space += new_space;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -217,8 +217,8 @@ struct jpeg_decomp_master {
|
||||
/* Partial decompression variables */
|
||||
JDIMENSION first_iMCU_col;
|
||||
JDIMENSION last_iMCU_col;
|
||||
JDIMENSION first_MCU_col[MAX_COMPS_IN_SCAN];
|
||||
JDIMENSION last_MCU_col[MAX_COMPS_IN_SCAN];
|
||||
JDIMENSION first_MCU_col[MAX_COMPONENTS];
|
||||
JDIMENSION last_MCU_col[MAX_COMPONENTS];
|
||||
boolean jinit_upsampler_no_alloc;
|
||||
};
|
||||
|
||||
|
||||
@@ -73,19 +73,24 @@ endif
|
||||
|
||||
if SIMD_POWERPC
|
||||
|
||||
libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h \
|
||||
noinst_LTLIBRARIES += libsimd_altivec.la
|
||||
|
||||
libsimd_altivec_la_SOURCES = \
|
||||
jccolor-altivec.c jcgray-altivec.c jcsample-altivec.c \
|
||||
jdcolor-altivec.c jdmerge-altivec.c jdsample-altivec.c \
|
||||
jfdctfst-altivec.c jfdctint-altivec.c \
|
||||
jidctfst-altivec.c jidctint-altivec.c \
|
||||
jquanti-altivec.c
|
||||
libsimd_la_CFLAGS = -maltivec
|
||||
libsimd_altivec_la_CFLAGS = -maltivec
|
||||
|
||||
jccolor-altivec.lo: jccolext-altivec.c
|
||||
jcgray-altivec.lo: jcgryext-altivec.c
|
||||
jdcolor-altivec.lo: jdcolext-altivec.c
|
||||
jdmerge-altivec.lo: jdmrgext-altivec.c
|
||||
|
||||
libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h
|
||||
libsimd_la_LIBADD = libsimd_altivec.la
|
||||
|
||||
endif
|
||||
|
||||
AM_CPPFLAGS = -I$(top_srcdir)
|
||||
|
||||
@@ -125,7 +125,7 @@ init_simd (void)
|
||||
/* Force different settings through environment variables */
|
||||
env = getenv("JSIMD_FORCENEON");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support &= JSIMD_ARM_NEON;
|
||||
simd_support = JSIMD_ARM_NEON;
|
||||
env = getenv("JSIMD_FORCENONE");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = 0;
|
||||
|
||||
@@ -142,7 +142,7 @@ init_simd (void)
|
||||
/* Force different settings through environment variables */
|
||||
env = getenv("JSIMD_FORCENEON");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support &= JSIMD_ARM_NEON;
|
||||
simd_support = JSIMD_ARM_NEON;
|
||||
env = getenv("JSIMD_FORCENONE");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = 0;
|
||||
|
||||
@@ -210,10 +210,16 @@ asm_function jsimd_idct_islow_neon
|
||||
TMP7 .req x13
|
||||
TMP8 .req x14
|
||||
|
||||
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
instruction ensures that those bits are set to zero. */
|
||||
uxtw x3, w3
|
||||
|
||||
sub sp, sp, #64
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
|
||||
mov x10, sp
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
|
||||
ld1 {v0.8h, v1.8h}, [x15]
|
||||
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
|
||||
ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
|
||||
@@ -238,7 +244,6 @@ asm_function jsimd_idct_islow_neon
|
||||
shl v10.8h, v2.8h, #(PASS1_BITS)
|
||||
sqxtn v16.8b, v15.8h
|
||||
mov TMP1, v16.d[0]
|
||||
sub sp, sp, #64
|
||||
mvn TMP2, TMP1
|
||||
|
||||
cbnz TMP2, 2f
|
||||
@@ -807,6 +812,11 @@ asm_function jsimd_idct_ifast_neon
|
||||
TMP7 .req x13
|
||||
TMP8 .req x14
|
||||
|
||||
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
instruction ensures that those bits are set to zero. */
|
||||
uxtw x3, w3
|
||||
|
||||
/* Load and dequantize coefficients into NEON registers
|
||||
* with the following allocation:
|
||||
* 0 1 2 3 | 4 5 6 7
|
||||
@@ -1101,19 +1111,18 @@ asm_function jsimd_idct_4x4_neon
|
||||
TMP3 .req x2
|
||||
TMP4 .req x15
|
||||
|
||||
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
instruction ensures that those bits are set to zero. */
|
||||
uxtw x3, w3
|
||||
|
||||
/* Save all used NEON registers */
|
||||
sub sp, sp, 272
|
||||
str x15, [sp], 16
|
||||
sub sp, sp, 64
|
||||
mov x9, sp
|
||||
/* Load constants (v3.4h is just used for padding) */
|
||||
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1222,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
|
||||
#endif
|
||||
|
||||
/* vpop {v8.4h - v15.4h} ;not available */
|
||||
sub sp, sp, #272
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1299,19 +1300,19 @@ asm_function jsimd_idct_2x2_neon
|
||||
TMP1 .req x0
|
||||
TMP2 .req x15
|
||||
|
||||
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
instruction ensures that those bits are set to zero. */
|
||||
uxtw x3, w3
|
||||
|
||||
/* vpush {v8.4h - v15.4h} ; not available */
|
||||
sub sp, sp, 208
|
||||
str x15, [sp], 16
|
||||
sub sp, sp, 64
|
||||
mov x9, sp
|
||||
|
||||
/* Load constants */
|
||||
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v21.8b, v22.8b}, [sp], 16
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v30.8b, v31.8b}, [sp], 16
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v14.4h}, [TMP2]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1411,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
|
||||
st1 {v26.b}[1], [TMP2], 1
|
||||
st1 {v27.b}[5], [TMP2], 1
|
||||
|
||||
sub sp, sp, #208
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v21.8b, v22.8b}, [sp], 16
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v30.8b, v31.8b}, [sp], 16
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1688,24 +1682,24 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
.else
|
||||
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
.endif
|
||||
OUTPUT_WIDTH .req x0
|
||||
OUTPUT_WIDTH .req w0
|
||||
INPUT_BUF .req x1
|
||||
INPUT_ROW .req x2
|
||||
INPUT_ROW .req w2
|
||||
OUTPUT_BUF .req x3
|
||||
NUM_ROWS .req x4
|
||||
NUM_ROWS .req w4
|
||||
|
||||
INPUT_BUF0 .req x5
|
||||
INPUT_BUF1 .req x6
|
||||
INPUT_BUF2 .req x1
|
||||
|
||||
RGB .req x7
|
||||
Y .req x8
|
||||
U .req x9
|
||||
V .req x10
|
||||
N .req x15
|
||||
Y .req x9
|
||||
U .req x10
|
||||
V .req x11
|
||||
N .req w15
|
||||
|
||||
sub sp, sp, 336
|
||||
str x15, [sp], 16
|
||||
sub sp, sp, 64
|
||||
mov x9, sp
|
||||
|
||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||
.if \fast_st3 == 1
|
||||
@@ -1715,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
.endif
|
||||
|
||||
/* Save NEON registers */
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v0.4h, v1.4h}, [x15], 16
|
||||
ld1 {v2.8h}, [x15]
|
||||
|
||||
/* Save ARM registers and handle input arguments */
|
||||
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
|
||||
stp x4, x5, [sp], 16
|
||||
stp x6, x7, [sp], 16
|
||||
stp x8, x9, [sp], 16
|
||||
stp x10, x30, [sp], 16
|
||||
ldr INPUT_BUF0, [INPUT_BUF]
|
||||
ldr INPUT_BUF1, [INPUT_BUF, #8]
|
||||
ldr INPUT_BUF2, [INPUT_BUF, #16]
|
||||
@@ -1745,11 +1727,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
cmp NUM_ROWS, #1
|
||||
b.lt 9f
|
||||
0:
|
||||
lsl x16, INPUT_ROW, #3
|
||||
ldr Y, [INPUT_BUF0, x16]
|
||||
ldr U, [INPUT_BUF1, x16]
|
||||
ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
|
||||
ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
|
||||
mov N, OUTPUT_WIDTH
|
||||
ldr V, [INPUT_BUF2, x16]
|
||||
ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
|
||||
add INPUT_ROW, INPUT_ROW, #1
|
||||
ldr RGB, [OUTPUT_BUF], #8
|
||||
|
||||
@@ -1799,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
b.gt 0b
|
||||
9:
|
||||
/* Restore all registers and return */
|
||||
sub sp, sp, #336
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
||||
ldp x4, x5, [sp], 16
|
||||
ldp x6, x7, [sp], 16
|
||||
ldp x8, x9, [sp], 16
|
||||
ldp x10, x30, [sp], 16
|
||||
br x30
|
||||
.unreq OUTPUT_WIDTH
|
||||
.unreq INPUT_ROW
|
||||
@@ -2054,8 +2022,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
OUTPUT_WIDTH .req w0
|
||||
INPUT_BUF .req x1
|
||||
OUTPUT_BUF .req x2
|
||||
OUTPUT_ROW .req x3
|
||||
NUM_ROWS .req x4
|
||||
OUTPUT_ROW .req w3
|
||||
NUM_ROWS .req w4
|
||||
|
||||
OUTPUT_BUF0 .req x5
|
||||
OUTPUT_BUF1 .req x6
|
||||
@@ -2082,17 +2050,18 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
|
||||
/* Save NEON registers */
|
||||
sub sp, sp, #64
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
mov x9, sp
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
|
||||
/* Outer loop over scanlines */
|
||||
cmp NUM_ROWS, #1
|
||||
b.lt 9f
|
||||
0:
|
||||
ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
|
||||
ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
|
||||
ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
|
||||
ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
|
||||
mov N, OUTPUT_WIDTH
|
||||
ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
|
||||
ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
|
||||
add OUTPUT_ROW, OUTPUT_ROW, #1
|
||||
ldr RGB, [INPUT_BUF], #8
|
||||
|
||||
@@ -2136,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
b.gt 0b
|
||||
9:
|
||||
/* Restore all registers and return */
|
||||
sub sp, sp, #64
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
br x30
|
||||
@@ -2199,6 +2167,11 @@ asm_function jsimd_convsamp_neon
|
||||
TMP8 .req x4
|
||||
TMPDUP .req w3
|
||||
|
||||
/* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
guarantee that the upper (unused) 32 bits of x1 are valid. This
|
||||
instruction ensures that those bits are set to zero. */
|
||||
uxtw x1, w1
|
||||
|
||||
mov TMPDUP, #128
|
||||
ldp TMP1, TMP2, [SAMPLE_DATA], 16
|
||||
ldp TMP3, TMP4, [SAMPLE_DATA], 16
|
||||
@@ -2335,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
|
||||
|
||||
/* Save NEON registers */
|
||||
sub sp, sp, #64
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
mov x10, sp
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
|
||||
|
||||
/* Load all DATA into NEON registers with the following allocation:
|
||||
* 0 1 2 3 | 4 5 6 7
|
||||
@@ -2566,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
|
||||
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
|
||||
|
||||
/* Restore NEON registers */
|
||||
sub sp, sp, #64
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
|
||||
@@ -3080,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
sub sp, sp, 272
|
||||
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
|
||||
/* Save ARM registers */
|
||||
stp x19, x20, [sp], 16
|
||||
stp x19, x20, [sp]
|
||||
.if \fast_tbl == 1
|
||||
adr x15, Ljsimd_huff_encode_one_block_neon_consts
|
||||
.else
|
||||
@@ -3294,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
and v18.16b, v18.16b, v23.16b
|
||||
add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
|
||||
and v20.16b, v20.16b, v23.16b
|
||||
add x15, sp, #0x80 /* x15 = t2 */
|
||||
add x15, sp, #0x90 /* x15 = t2 */
|
||||
and v22.16b, v22.16b, v23.16b
|
||||
ldr w10, [x4, x12, lsl #2]
|
||||
addp v16.16b, v16.16b, v18.16b
|
||||
@@ -3317,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
rbit x9, x9 /* x9 = index0 */
|
||||
ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
|
||||
cmp w12, #(64-8)
|
||||
mov x11, sp
|
||||
add x11, sp, #16
|
||||
b.lt 4f
|
||||
cbz x9, 6f
|
||||
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
|
||||
@@ -3421,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
put_bits x3, x11
|
||||
cbnz x9, 1b
|
||||
6:
|
||||
add x13, sp, #0xfe
|
||||
add x13, sp, #0x10e
|
||||
cmp x15, x13
|
||||
b.hs 1f
|
||||
ldr w12, [x5]
|
||||
@@ -3429,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
checkbuf47
|
||||
put_bits x12, x14
|
||||
1:
|
||||
sub sp, sp, 16
|
||||
str PUT_BUFFER, [x0, #0x10]
|
||||
str PUT_BITSw, [x0, #0x18]
|
||||
ldp x19, x20, [sp], 16
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* jsimd_mips.c
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014, D. R. Commander.
|
||||
* Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
|
||||
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
*
|
||||
@@ -77,6 +77,14 @@ init_simd (void)
|
||||
if (!parse_proc_cpuinfo("MIPS 74K"))
|
||||
return;
|
||||
#endif
|
||||
|
||||
/* Force different settings through environment variables */
|
||||
env = getenv("JSIMD_FORCEDSPR2");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = JSIMD_MIPS_DSPR2;
|
||||
env = getenv("JSIMD_FORCENONE");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = 0;
|
||||
}
|
||||
|
||||
static const int mips_idct_ifast_coefs[4] = {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* jsimd_powerpc.c
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014-2015, D. R. Commander.
|
||||
* Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
@@ -22,19 +22,106 @@
|
||||
#include "../jsimddct.h"
|
||||
#include "jsimd.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
static unsigned int simd_support = ~0;
|
||||
|
||||
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
|
||||
|
||||
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
|
||||
|
||||
/*
 * Check whether a single line of /proc/cpuinfo advertises a CPU feature.
 *
 * buffer:  one NUL-terminated line of text.  Only lines that begin with
 *          "cpu" are examined; any other line yields 0.
 * feature: the feature name to look for (e.g. "altivec").  An empty string
 *          never matches.
 *
 * Returns 1 if 'feature' occurs in the text that follows the "cpu" prefix as
 * a separate word, i.e. it is preceded by whitespace (or starts the examined
 * text) and is followed by whitespace or the end of the string.  Returns 0
 * otherwise.
 */
LOCAL(int)
check_feature (char *buffer, char *feature)
{
  char *start, *p;
  size_t len;

  if (*feature == 0)
    return 0;
  if (strncmp(buffer, "cpu", 3) != 0)
    return 0;
  buffer += 3;
  /* The <ctype.h> functions require a value representable as unsigned char,
     so cast before classifying (plain char may be negative.) */
  while (isspace((unsigned char)*buffer))
    buffer++;

  /* Remember where the examined text begins.  The left-hand word boundary
     must always be judged against this fixed position, not against the
     moving search cursor; otherwise a rejected occurrence (e.g. the tail of
     "noaltivec") would be accepted once the cursor catches up with it. */
  start = buffer;
  len = strlen(feature);

  /* Check if 'feature' is present in the buffer as a separate word */
  while ((p = strstr(buffer, feature))) {
    if ((p == start || isspace((unsigned char)p[-1])) &&
        (p[len] == 0 || isspace((unsigned char)p[len])))
      return 1;
    /* Not a whole word: resume the search just past this occurrence. */
    buffer = p + 1;
  }
  return 0;
}
|
||||
|
||||
LOCAL(int)
|
||||
parse_proc_cpuinfo (int bufsize)
|
||||
{
|
||||
char *buffer = (char *)malloc(bufsize);
|
||||
FILE *fd;
|
||||
simd_support = 0;
|
||||
|
||||
if (!buffer)
|
||||
return 0;
|
||||
|
||||
fd = fopen("/proc/cpuinfo", "r");
|
||||
if (fd) {
|
||||
while (fgets(buffer, bufsize, fd)) {
|
||||
if (!strchr(buffer, '\n') && !feof(fd)) {
|
||||
/* "impossible" happened - insufficient size of the buffer! */
|
||||
fclose(fd);
|
||||
free(buffer);
|
||||
return 0;
|
||||
}
|
||||
if (check_feature(buffer, "altivec"))
|
||||
simd_support |= JSIMD_ALTIVEC;
|
||||
}
|
||||
fclose(fd);
|
||||
}
|
||||
free(buffer);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Check what SIMD accelerations are supported.
|
||||
*
|
||||
* FIXME: This code is racy under a multi-threaded environment.
|
||||
*/
|
||||
LOCAL(void)
|
||||
init_simd (void)
|
||||
{
|
||||
char *env = NULL;
|
||||
#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
|
||||
int bufsize = 1024; /* an initial guess for the line buffer size limit */
|
||||
#endif
|
||||
|
||||
if (simd_support != ~0U)
|
||||
return;
|
||||
|
||||
simd_support = JSIMD_ALTIVEC;
|
||||
simd_support = 0;
|
||||
|
||||
#if defined(__ALTIVEC__) || defined(__APPLE__)
|
||||
simd_support |= JSIMD_ALTIVEC;
|
||||
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
|
||||
while (!parse_proc_cpuinfo(bufsize)) {
|
||||
bufsize *= 2;
|
||||
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Force different settings through environment variables */
|
||||
env = getenv("JSIMD_FORCEALTIVEC");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = JSIMD_ALTIVEC;
|
||||
env = getenv("JSIMD_FORCENONE");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = 0;
|
||||
|
||||
18
tjbench.c
18
tjbench.c
@@ -248,7 +248,8 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf,
|
||||
int y=(int)((double)srcbuf[rindex]*0.299
|
||||
+ (double)srcbuf[gindex]*0.587
|
||||
+ (double)srcbuf[bindex]*0.114 + 0.5);
|
||||
if(y>255) y=255; if(y<0) y=0;
|
||||
if(y>255) y=255;
|
||||
if(y<0) y=0;
|
||||
dstbuf[rindex]=abs(dstbuf[rindex]-y);
|
||||
dstbuf[gindex]=abs(dstbuf[gindex]-y);
|
||||
dstbuf[bindex]=abs(dstbuf[bindex]-y);
|
||||
@@ -300,7 +301,8 @@ int fullTest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
|
||||
|
||||
for(tilew=dotile? 8:w, tileh=dotile? 8:h; ; tilew*=2, tileh*=2)
|
||||
{
|
||||
if(tilew>w) tilew=w; if(tileh>h) tileh=h;
|
||||
if(tilew>w) tilew=w;
|
||||
if(tileh>h) tileh=h;
|
||||
ntilesw=(w+tilew-1)/tilew; ntilesh=(h+tileh-1)/tileh;
|
||||
|
||||
if((jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)
|
||||
@@ -447,7 +449,8 @@ int fullTest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
|
||||
|
||||
for(i=0; i<ntilesw*ntilesh; i++)
|
||||
{
|
||||
if(jpegbuf[i]) tjFree(jpegbuf[i]); jpegbuf[i]=NULL;
|
||||
if(jpegbuf[i]) tjFree(jpegbuf[i]);
|
||||
jpegbuf[i]=NULL;
|
||||
}
|
||||
free(jpegbuf); jpegbuf=NULL;
|
||||
free(jpegsize); jpegsize=NULL;
|
||||
@@ -465,7 +468,8 @@ int fullTest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
|
||||
{
|
||||
for(i=0; i<ntilesw*ntilesh; i++)
|
||||
{
|
||||
if(jpegbuf[i]) tjFree(jpegbuf[i]); jpegbuf[i]=NULL;
|
||||
if(jpegbuf[i]) tjFree(jpegbuf[i]);
|
||||
jpegbuf[i]=NULL;
|
||||
}
|
||||
free(jpegbuf); jpegbuf=NULL;
|
||||
}
|
||||
@@ -532,7 +536,8 @@ int decompTest(char *filename)
|
||||
|
||||
for(tilew=dotile? 16:w, tileh=dotile? 16:h; ; tilew*=2, tileh*=2)
|
||||
{
|
||||
if(tilew>w) tilew=w; if(tileh>h) tileh=h;
|
||||
if(tilew>w) tilew=w;
|
||||
if(tileh>h) tileh=h;
|
||||
ntilesw=(w+tilew-1)/tilew; ntilesh=(h+tileh-1)/tileh;
|
||||
|
||||
if((jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)
|
||||
@@ -692,7 +697,8 @@ int decompTest(char *filename)
|
||||
{
|
||||
for(i=0; i<ntilesw*ntilesh; i++)
|
||||
{
|
||||
if(jpegbuf[i]) tjFree(jpegbuf[i]); jpegbuf[i]=NULL;
|
||||
if(jpegbuf[i]) tjFree(jpegbuf[i]);
|
||||
jpegbuf[i]=NULL;
|
||||
}
|
||||
free(jpegbuf); jpegbuf=NULL;
|
||||
}
|
||||
|
||||
29
turbojpeg.c
29
turbojpeg.c
@@ -376,6 +376,29 @@ static int getSubsamp(j_decompress_ptr dinfo)
|
||||
retval=i; break;
|
||||
}
|
||||
}
|
||||
/* Handle 4:2:2 and 4:4:0 images whose sampling factors are specified
|
||||
in non-standard ways. */
|
||||
if(dinfo->comp_info[0].h_samp_factor==2 &&
|
||||
dinfo->comp_info[0].v_samp_factor==2 &&
|
||||
(i==TJSAMP_422 || i==TJSAMP_440))
|
||||
{
|
||||
int match=0;
|
||||
for(k=1; k<dinfo->num_components; k++)
|
||||
{
|
||||
int href=tjMCUHeight[i]/8, vref=tjMCUWidth[i]/8;
|
||||
if(dinfo->jpeg_color_space==JCS_YCCK && k==3)
|
||||
{
|
||||
href=vref=2;
|
||||
}
|
||||
if(dinfo->comp_info[k].h_samp_factor==href
|
||||
&& dinfo->comp_info[k].v_samp_factor==vref)
|
||||
match++;
|
||||
}
|
||||
if(match==dinfo->num_components-1)
|
||||
{
|
||||
retval=i; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return retval;
|
||||
@@ -578,7 +601,8 @@ static tjhandle _tjInitCompress(tjinstance *this)
|
||||
if(setjmp(this->jerr.setjmp_buffer))
|
||||
{
|
||||
/* If we get here, the JPEG code has signaled an error. */
|
||||
if(this) free(this); return NULL;
|
||||
if(this) free(this);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
jpeg_create_compress(&this->cinfo);
|
||||
@@ -1239,7 +1263,8 @@ static tjhandle _tjInitDecompress(tjinstance *this)
|
||||
if(setjmp(this->jerr.setjmp_buffer))
|
||||
{
|
||||
/* If we get here, the JPEG code has signaled an error. */
|
||||
if(this) free(this); return NULL;
|
||||
if(this) free(this);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
jpeg_create_decompress(&this->dinfo);
|
||||
|
||||
Reference in New Issue
Block a user