Merge branch 'master' into dev

This commit is contained in:
DRC
2020-10-27 16:42:14 -05:00
14 changed files with 1390 additions and 1323 deletions

View File

@@ -127,7 +127,6 @@ EXTERN(void) read_color_map(j_decompress_ptr cinfo, FILE *infile);
/* common support routines (in cdjpeg.c) */
EXTERN(void) enable_signal_catcher(j_common_ptr cinfo);
EXTERN(void) start_progress_monitor(j_common_ptr cinfo,
cd_progress_ptr progress);
EXTERN(void) end_progress_monitor(j_common_ptr cinfo);

View File

@@ -571,11 +571,10 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
* RGB565 conversion
*/
#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \
(((g) << 3) & 0x7E0) | ((b) >> 3))
#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \
(((g) << 11) & 0xE000) | \
(((b) << 5) & 0x1F00))
/*
 * Pack three color components into a single 16-bit RGB565 value.
 *
 * PACK_SHORT_565_LE: bits 15-11 hold r bits 7-3, bits 10-5 hold g bits 7-2,
 * and bits 4-0 hold b bits 7-3.
 * PACK_SHORT_565_BE: the same 16-bit value with its two bytes swapped
 * (low byte = r bits 7-3 and g bits 7-5, high byte = g bits 4-2 and
 * b bits 7-3).
 *
 * NOTE(review): the (b) >> 3 and (g) >> 5 terms are not masked, so the
 * layouts above assume r, g, and b are in the range 0-255 -- confirm against
 * the callers.
 */
#define PACK_SHORT_565_LE(r, g, b) \
  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
#define PACK_SHORT_565_BE(r, g, b) \
  (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))

/*
 * Combine two packed 16-bit pixels into one value:  the _LE form places r in
 * the upper 16 bits and l in the lower 16 bits; the _BE form does the
 * opposite.  (l and r are presumably the left and right pixels of a pair --
 * confirm against the callers.)
 *
 * Both arguments are parenthesized so that an expression argument such as
 * (a | b) is shifted/ORed as a whole rather than being split by operator
 * precedence.
 */
#define PACK_TWO_PIXELS_LE(l, r)    (((r) << 16) | (l))
#define PACK_TWO_PIXELS_BE(l, r)    (((l) << 16) | (r))

View File

@@ -392,11 +392,10 @@ h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
* RGB565 conversion
*/
#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \
(((g) << 3) & 0x7E0) | ((b) >> 3))
#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \
(((g) << 11) & 0xE000) | \
(((b) << 5) & 0x1F00))
/*
 * Pack three color components into a single 16-bit RGB565 value.
 *
 * PACK_SHORT_565_LE: bits 15-11 hold r bits 7-3, bits 10-5 hold g bits 7-2,
 * and bits 4-0 hold b bits 7-3.
 * PACK_SHORT_565_BE: the same 16-bit value with its two bytes swapped
 * (low byte = r bits 7-3 and g bits 7-5, high byte = g bits 4-2 and
 * b bits 7-3).
 *
 * NOTE(review): the (b) >> 3 and (g) >> 5 terms are not masked, so the
 * layouts above assume r, g, and b are in the range 0-255 -- confirm against
 * the callers.
 */
#define PACK_SHORT_565_LE(r, g, b) \
  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
#define PACK_SHORT_565_BE(r, g, b) \
  (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))

/*
 * Combine two packed 16-bit pixels into one value:  the _LE form places r in
 * the upper 16 bits and l in the lower 16 bits; the _BE form does the
 * opposite.  (l and r are presumably the left and right pixels of a pair --
 * confirm against the callers.)
 *
 * Both arguments are parenthesized so that an expression argument such as
 * (a | b) is shifted/ORed as a whole rather than being split by operator
 * precedence.
 */
#define PACK_TWO_PIXELS_LE(l, r)    (((r) << 16) | (l))
#define PACK_TWO_PIXELS_BE(l, r)    (((l) << 16) | (r))

View File

@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-2019, Thomas G. Lane, Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2014, 2017, 2019, D. R. Commander.
* Copyright (C) 2010, 2014, 2017, 2019-2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*

View File

@@ -649,11 +649,12 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
if (maxval > 255) {
source->pub.get_pixel_rows = get_word_rgb_row;
} else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR) &&
(cinfo->in_color_space == JCS_EXT_RGB
#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
|| cinfo->in_color_space == JCS_RGB
(cinfo->in_color_space == JCS_EXT_RGB ||
cinfo->in_color_space == JCS_RGB)) {
#else
cinfo->in_color_space == JCS_EXT_RGB) {
#endif
)) {
source->pub.get_pixel_rows = get_raw_row;
use_raw_buffer = TRUE;
need_rescale = FALSE;

View File

@@ -334,8 +334,9 @@ start_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
unsigned int width, height, maplen;
boolean is_bottom_up;
#define GET_2B(offset) ((unsigned int)UCH(targaheader[offset]) + \
(((unsigned int)UCH(targaheader[offset + 1])) << 8))
/*
 * Assemble an unsigned 16-bit value from two consecutive bytes of
 * targaheader[]:  the byte at index `offset` supplies the low 8 bits and the
 * byte at index `offset + 1` supplies the high 8 bits.  `offset` is
 * parenthesized so that an expression argument (e.g. a shift or conditional)
 * is indexed as a whole in both terms.  UCH() and targaheader are defined
 * outside this macro.
 */
#define GET_2B(offset) \
  ((unsigned int)UCH(targaheader[(offset)]) + \
   (((unsigned int)UCH(targaheader[(offset) + 1])) << 8))
if (!ReadOK(source->pub.input_file, targaheader, 18))
ERREXIT(cinfo, JERR_INPUT_EOF);

View File

@@ -1,36 +1,36 @@
%global _docdir %{_defaultdocdir}/%{name}-%{version}
%define _prefix @CMAKE_INSTALL_PREFIX@
%define _bindir @CMAKE_INSTALL_FULL_BINDIR@
%define _datarootdir @CMAKE_INSTALL_FULL_DATAROOTDIR@
%define _includedir @CMAKE_INSTALL_FULL_INCLUDEDIR@
%define _javadir @CMAKE_INSTALL_FULL_JAVADIR@
%define _mandir @CMAKE_INSTALL_FULL_MANDIR@
%define _enable_static @ENABLE_STATIC@
%define _enable_shared @ENABLE_SHARED@
%define _with_turbojpeg @WITH_TURBOJPEG@
%define _with_java @WITH_JAVA@
%define _prefix @CMAKE_INSTALL_PREFIX@
%define _bindir @CMAKE_INSTALL_FULL_BINDIR@
%define _datarootdir @CMAKE_INSTALL_FULL_DATAROOTDIR@
%define _includedir @CMAKE_INSTALL_FULL_INCLUDEDIR@
%define _javadir @CMAKE_INSTALL_FULL_JAVADIR@
%define _mandir @CMAKE_INSTALL_FULL_MANDIR@
%define _enable_static @ENABLE_STATIC@
%define _enable_shared @ENABLE_SHARED@
%define _with_turbojpeg @WITH_TURBOJPEG@
%define _with_java @WITH_JAVA@
%if "%{?__isa_bits:1}" == "1"
%define _bits %{__isa_bits}
%define _bits %{__isa_bits}
%else
# RPM < 4.6
%if "%{_lib}" == "lib64"
%define _bits 64
%define _bits 64
%else
%define _bits 32
%define _bits 32
%endif
%endif
#-->%if 1
%if "%{_bits}" == "64"
%define _libdir %{_exec_prefix}/lib64
%define _libdir %{_exec_prefix}/lib64
%else
%if "%{_prefix}" == "/opt/libjpeg-turbo"
%define _libdir %{_exec_prefix}/lib32
%define _libdir %{_exec_prefix}/lib32
%endif
%endif
#-->%else
%define _libdir @CMAKE_INSTALL_FULL_LIBDIR@
%define _libdir @CMAKE_INSTALL_FULL_LIBDIR@
#-->%endif
Summary: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
@@ -101,7 +101,6 @@ broader range of users and developers.
#-->make DESTDIR=$RPM_BUILD_ROOT
%install
rm -rf $RPM_BUILD_ROOT
make install DESTDIR=$RPM_BUILD_ROOT
/sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir}
@@ -163,25 +162,25 @@ rm -rf $RPM_BUILD_ROOT
%doc %{_docdir}/*
%dir %{_prefix}
%if "%{_prefix}" == "@CMAKE_INSTALL_DEFAULT_PREFIX@" && "%{_docdir}" != "%{_prefix}/doc"
%{_prefix}/doc
%{_prefix}/doc
%endif
%dir %{_bindir}
%{_bindir}/cjpeg
%{_bindir}/djpeg
%{_bindir}/jpegtran
%if "%{_with_turbojpeg}" == "1"
%{_bindir}/tjbench
%{_bindir}/tjbench
%endif
%{_bindir}/rdjpgcom
%{_bindir}/wrjpgcom
%dir %{_libdir}
%if "%{_enable_shared}" == "1"
%{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@
%{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
%{_libdir}/libjpeg.so
%{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@
%{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
%{_libdir}/libjpeg.so
%endif
%if "%{_enable_static}" == "1"
%{_libdir}/libjpeg.a
%{_libdir}/libjpeg.a
%endif
%dir %{_libdir}/pkgconfig
%{_libdir}/pkgconfig/libjpeg.pc
@@ -189,15 +188,15 @@ rm -rf $RPM_BUILD_ROOT
%dir %{_libdir}/cmake/@CMAKE_PROJECT_NAME@
%{_libdir}/cmake/@CMAKE_PROJECT_NAME@
%if "%{_with_turbojpeg}" == "1"
%if "%{_enable_shared}" == "1" || "%{_with_java}" == "1"
%{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@
%{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_MAJOR_VERSION@
%{_libdir}/libturbojpeg.so
%endif
%if "%{_enable_static}" == "1"
%{_libdir}/libturbojpeg.a
%endif
%{_libdir}/pkgconfig/libturbojpeg.pc
%if "%{_enable_shared}" == "1" || "%{_with_java}" == "1"
%{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@
%{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_MAJOR_VERSION@
%{_libdir}/libturbojpeg.so
%endif
%if "%{_enable_static}" == "1"
%{_libdir}/libturbojpeg.a
%endif
%{_libdir}/pkgconfig/libturbojpeg.pc
%endif
%dir %{_includedir}
%{_includedir}/jconfig.h
@@ -205,7 +204,7 @@ rm -rf $RPM_BUILD_ROOT
%{_includedir}/jmorecfg.h
%{_includedir}/jpeglib.h
%if "%{_with_turbojpeg}" == "1"
%{_includedir}/turbojpeg.h
%{_includedir}/turbojpeg.h
%endif
%dir %{_mandir}
%dir %{_mandir}/man1
@@ -215,10 +214,11 @@ rm -rf $RPM_BUILD_ROOT
%{_mandir}/man1/rdjpgcom.1*
%{_mandir}/man1/wrjpgcom.1*
%if "%{_prefix}" != "%{_datarootdir}"
%dir %{_datarootdir}
%dir %{_datarootdir}
%endif
%if "%{_with_java}" == "1"
%dir %{_javadir}
%{_javadir}/turbojpeg.jar
%dir %{_javadir}
%{_javadir}/turbojpeg.jar
%endif
%changelog

View File

@@ -107,69 +107,69 @@ _\fname:
* Uses some ideas from the comments in 'simd/jiss2int-64.asm'
*/
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
JLONG q1, q2, q3, q4, q5, q6, q7; \
JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
\
/* 1-D iDCT input data */ \
row0 = xrow0; \
row1 = xrow1; \
row2 = xrow2; \
row3 = xrow3; \
row4 = xrow4; \
row5 = xrow5; \
row6 = xrow6; \
row7 = xrow7; \
\
q5 = row7 + row3; \
q4 = row5 + row1; \
q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
MULTIPLY(q4, FIX_1_175875602); \
q7 = MULTIPLY(q5, FIX_1_175875602) + \
MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
q2 = MULTIPLY(row2, FIX_0_541196100) + \
MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
q4 = q6; \
q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
q6 += MULTIPLY(row5, -FIX_2_562915447) + \
MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
/* now we can use q1 (reloadable constants have been used up) */ \
q1 = q3 + q2; \
q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
MULTIPLY(row1, -FIX_0_899976223); \
q5 = q7; \
q1 = q1 + q6; \
q7 += MULTIPLY(row7, -FIX_0_899976223) + \
MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
\
/* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
tmp11_plus_tmp2 = q1; \
row1 = 0; \
\
q1 = q1 - q6; \
q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
MULTIPLY(row3, -FIX_2_562915447); \
q1 = q1 - q6; \
q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
MULTIPLY(row6, FIX_0_541196100); \
q3 = q3 - q2; \
\
/* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
tmp11_minus_tmp2 = q1; \
\
q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
q2 = q1 + q6; \
q1 = q1 - q6; \
\
/* pick up the results */ \
tmp0 = q4; \
tmp1 = q5; \
tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
tmp3 = q7; \
tmp10 = q2; \
tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
tmp12 = q3; \
tmp13 = q1; \
DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
JLONG q1, q2, q3, q4, q5, q6, q7; \
JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
\
/* 1-D iDCT input data */ \
row0 = xrow0; \
row1 = xrow1; \
row2 = xrow2; \
row3 = xrow3; \
row4 = xrow4; \
row5 = xrow5; \
row6 = xrow6; \
row7 = xrow7; \
\
q5 = row7 + row3; \
q4 = row5 + row1; \
q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
MULTIPLY(q4, FIX_1_175875602); \
q7 = MULTIPLY(q5, FIX_1_175875602) + \
MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
q2 = MULTIPLY(row2, FIX_0_541196100) + \
MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
q4 = q6; \
q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
q6 += MULTIPLY(row5, -FIX_2_562915447) + \
MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
/* now we can use q1 (reloadable constants have been used up) */ \
q1 = q3 + q2; \
q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
MULTIPLY(row1, -FIX_0_899976223); \
q5 = q7; \
q1 = q1 + q6; \
q7 += MULTIPLY(row7, -FIX_0_899976223) + \
MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
\
/* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
tmp11_plus_tmp2 = q1; \
row1 = 0; \
\
q1 = q1 - q6; \
q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
MULTIPLY(row3, -FIX_2_562915447); \
q1 = q1 - q6; \
q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
MULTIPLY(row6, FIX_0_541196100); \
q3 = q3 - q2; \
\
/* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
tmp11_minus_tmp2 = q1; \
\
q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
q2 = q1 + q6; \
q1 = q1 - q6; \
\
/* pick up the results */ \
tmp0 = q4; \
tmp1 = q5; \
tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
tmp3 = q7; \
tmp10 = q2; \
tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
tmp12 = q3; \
tmp13 = q1; \
}
#define XFIX_0_899976223 d0[0]
@@ -261,7 +261,7 @@ asm_function jsimd_idct_islow_neon
vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
add ip, ip, #16
vmul.s16 q15, q15, q3
vpush {d8-d15} /* save Neon registers */
vpush {d8 - d15} /* save Neon registers */
/* 1-D IDCT, pass 1, left 4x8 half */
vadd.s16 d4, ROW7L, ROW3L
vadd.s16 d5, ROW5L, ROW1L
@@ -507,7 +507,7 @@ asm_function jsimd_idct_islow_neon
vqrshrn.s16 d17, q9, #2
vqrshrn.s16 d18, q10, #2
vqrshrn.s16 d19, q11, #2
vpop {d8-d15} /* restore Neon registers */
vpop {d8 - d15} /* restore Neon registers */
vqrshrn.s16 d20, q12, #2
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
vtrn.16 q8, q9
@@ -749,7 +749,7 @@ asm_function jsimd_idct_ifast_neon
vmul.s16 q13, q13, q1
vld1.16 {d0}, [ip, :64] /* load constants */
vmul.s16 q15, q15, q3
vpush {d8-d13} /* save Neon registers */
vpush {d8 - d13} /* save Neon registers */
/* 1-D IDCT, pass 1 */
vsub.s16 q2, q10, q14
vadd.s16 q14, q10, q14
@@ -842,7 +842,7 @@ asm_function jsimd_idct_ifast_neon
vadd.s16 q14, q5, q3
vsub.s16 q9, q5, q3
vsub.s16 q13, q10, q2
vpop {d8-d13} /* restore Neon registers */
vpop {d8 - d13} /* restore Neon registers */
vadd.s16 q10, q10, q2
vsub.s16 q11, q12, q1
vadd.s16 q12, q12, q1
@@ -1010,7 +1010,7 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req r2
TMP4 .req ip
vpush {d8-d15}
vpush {d8 - d15}
/* Load constants (d3 is just used for padding) */
adr TMP4, jsimd_idct_4x4_neon_consts
@@ -1099,7 +1099,7 @@ asm_function jsimd_idct_4x4_neon
vst1.8 {d27[7]}, [TMP4]!
#endif
vpop {d8-d15}
vpop {d8 - d15}
bx lr
.unreq DCT_TABLE
@@ -1167,7 +1167,7 @@ asm_function jsimd_idct_2x2_neon
TMP1 .req r0
TMP2 .req ip
vpush {d8-d15}
vpush {d8 - d15}
/* Load constants */
adr TMP2, jsimd_idct_2x2_neon_consts
@@ -1254,7 +1254,7 @@ asm_function jsimd_idct_2x2_neon
vst1.8 {d26[1]}, [TMP2]!
vst1.8 {d27[5]}, [TMP2]!
vpop {d8-d15}
vpop {d8 - d15}
bx lr
.unreq DCT_TABLE
@@ -1508,7 +1508,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.unreq INPUT_BUF
/* Save Neon registers */
vpush {d8-d15}
vpush {d8 - d15}
/* Initially set d10, d11, d12, d13 to 0xFF */
vmov.u8 q5, #255
@@ -1571,7 +1571,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
bgt 0b
9:
/* Restore all registers and return */
vpop {d8-d15}
vpop {d8 - d15}
pop {r4, r5, r6, r7, r8, r9, r10, pc}
.unreq OUTPUT_WIDTH
@@ -1823,7 +1823,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon
.unreq OUTPUT_BUF
/* Save Neon registers */
vpush {d8-d15}
vpush {d8 - d15}
/* Outer loop over scanlines */
cmp NUM_ROWS, #1
@@ -1882,7 +1882,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon
bgt 0b
9:
/* Restore all registers and return */
vpop {d8-d15}
vpop {d8 - d15}
pop {r4, r5, r6, r7, r8, r9, r10, pc}
.unreq OUTPUT_WIDTH
@@ -2011,7 +2011,7 @@ asm_function jsimd_fdct_ifast_neon
DATA .req r0
TMP .req ip
vpush {d8-d15}
vpush {d8 - d15}
/* Load constants */
adr TMP, jsimd_fdct_ifast_neon_consts
@@ -2096,7 +2096,7 @@ asm_function jsimd_fdct_ifast_neon
vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
vst1.16 {d28, d29, d30, d31}, [DATA, :128]
vpop {d8-d15}
vpop {d8 - d15}
bx lr
.unreq DATA
@@ -2404,7 +2404,7 @@ asm_function jsimd_h2v1_fancy_upsample_neon
TMP .req lr
push {r4, r5, r6, lr}
vpush {d8-d15}
vpush {d8 - d15}
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
cmp MAX_V_SAMP_FACTOR, #0
@@ -2422,7 +2422,7 @@ asm_function jsimd_h2v1_fancy_upsample_neon
bgt 11b
99:
vpop {d8-d15}
vpop {d8 - d15}
pop {r4, r5, r6, pc}
.unreq MAX_V_SAMP_FACTOR

View File

@@ -627,21 +627,21 @@ asm_function jsimd_idct_islow_neon
movi v0.16b, #(CENTERJSAMPLE)
/* Prepare pointers (dual-issue with Neon instructions) */
ldp TMP1, TMP2, [OUTPUT_BUF], 16
sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP3, TMP4, [OUTPUT_BUF], 16
sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP1, TMP1, OUTPUT_COL
sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP2, TMP2, OUTPUT_COL
sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP3, TMP3, OUTPUT_COL
sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP4, TMP4, OUTPUT_COL
sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP5, TMP6, [OUTPUT_BUF], 16
sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP7, TMP8, [OUTPUT_BUF], 16
sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP5, TMP5, OUTPUT_COL
add v16.16b, v28.16b, v0.16b
add TMP6, TMP6, OUTPUT_COL
@@ -753,14 +753,14 @@ asm_function jsimd_idct_islow_neon
add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
mov v6.16b, v15.16b
mov v7.16b, v15.16b
mov v8.16b, v15.16b
@@ -837,14 +837,14 @@ asm_function jsimd_idct_islow_neon
mov v3.16b, v14.16b
mov v4.16b, v14.16b
mov v5.16b, v14.16b
rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
b 1b
.balign 16
@@ -947,22 +947,22 @@ asm_function jsimd_idct_islow_neon
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
b 1b
.unreq DCT_TABLE
@@ -1419,7 +1419,7 @@ asm_function jsimd_idct_4x4_neon
st1 {v27.b}[7], [TMP4], 1
#endif
/* vpop {v8.4h - v15.4h} ;not available */
/* vpop {v8.4h - v15.4h} (not available) */
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
blr x30
@@ -1487,7 +1487,7 @@ asm_function jsimd_idct_2x2_neon
instruction ensures that those bits are set to zero. */
uxtw x3, w3
/* vpush {v8.4h - v15.4h} ; not available */
/* vpush {v8.4h - v15.4h} (not available) */
sub sp, sp, 64
mov x9, sp
@@ -3457,136 +3457,136 @@ generate_jsimd_huff_encode_one_block 0
*/
.macro LOAD16
ldr T0d, [LUT, #(0*4)]
ldr T1d, [LUT, #(8*4)]
ldr T0d, [LUT, #(0 * 4)]
ldr T1d, [LUT, #(8 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[0], [T0]
ld1 {Y1.h}[0], [T1]
ldr T0d, [LUT, #(1*4)]
ldr T1d, [LUT, #(9*4)]
ldr T0d, [LUT, #(1 * 4)]
ldr T1d, [LUT, #(9 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[1], [T0]
ld1 {Y1.h}[1], [T1]
ldr T0d, [LUT, #(2*4)]
ldr T1d, [LUT, #(10*4)]
ldr T0d, [LUT, #(2 * 4)]
ldr T1d, [LUT, #(10 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[2], [T0]
ld1 {Y1.h}[2], [T1]
ldr T0d, [LUT, #(3*4)]
ldr T1d, [LUT, #(11*4)]
ldr T0d, [LUT, #(3 * 4)]
ldr T1d, [LUT, #(11 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[3], [T0]
ld1 {Y1.h}[3], [T1]
ldr T0d, [LUT, #(4*4)]
ldr T1d, [LUT, #(12*4)]
ldr T0d, [LUT, #(4 * 4)]
ldr T1d, [LUT, #(12 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[4], [T0]
ld1 {Y1.h}[4], [T1]
ldr T0d, [LUT, #(5*4)]
ldr T1d, [LUT, #(13*4)]
ldr T0d, [LUT, #(5 * 4)]
ldr T1d, [LUT, #(13 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[5], [T0]
ld1 {Y1.h}[5], [T1]
ldr T0d, [LUT, #(6*4)]
ldr T1d, [LUT, #(14*4)]
ldr T0d, [LUT, #(6 * 4)]
ldr T1d, [LUT, #(14 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[6], [T0]
ld1 {Y1.h}[6], [T1]
ldr T0d, [LUT, #(7*4)]
ldr T1d, [LUT, #(15*4)]
ldr T0d, [LUT, #(7 * 4)]
ldr T1d, [LUT, #(15 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[7], [T0]
ld1 {Y1.h}[7], [T1]
add LUT, LUT, #(16*4)
add LUT, LUT, #(16 * 4)
.endm
.macro LOAD15
eor Y1.16b, Y1.16b, Y1.16b
ldr T0d, [LUT, #(0*4)]
ldr T1d, [LUT, #(8*4)]
ldr T0d, [LUT, #(0 * 4)]
ldr T1d, [LUT, #(8 * 4)]
add T0, BLOCK, T0, lsl #1
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[0], [T0]
ld1 {Y1.h}[0], [T1]
ldr T0d, [LUT, #(1*4)]
ldr T0d, [LUT, #(1 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[1], [T0]
ldr T0d, [LUT, #(2*4)]
ldr T0d, [LUT, #(2 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[2], [T0]
ldr T0d, [LUT, #(3*4)]
ldr T0d, [LUT, #(3 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[3], [T0]
ldr T0d, [LUT, #(4*4)]
ldr T0d, [LUT, #(4 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[4], [T0]
ldr T0d, [LUT, #(5*4)]
ldr T0d, [LUT, #(5 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[5], [T0]
ldr T0d, [LUT, #(6*4)]
ldr T0d, [LUT, #(6 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[6], [T0]
ldr T0d, [LUT, #(7*4)]
ldr T0d, [LUT, #(7 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[7], [T0]
cmp LENEND, #2
b.lt 1515f
ldr T1d, [LUT, #(9*4)]
ldr T1d, [LUT, #(9 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[1], [T1]
cmp LENEND, #3
b.lt 1515f
ldr T1d, [LUT, #(10*4)]
ldr T1d, [LUT, #(10 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[2], [T1]
cmp LENEND, #4
b.lt 1515f
ldr T1d, [LUT, #(11*4)]
ldr T1d, [LUT, #(11 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[3], [T1]
cmp LENEND, #5
b.lt 1515f
ldr T1d, [LUT, #(12*4)]
ldr T1d, [LUT, #(12 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[4], [T1]
cmp LENEND, #6
b.lt 1515f
ldr T1d, [LUT, #(13*4)]
ldr T1d, [LUT, #(13 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[5], [T1]
cmp LENEND, #7
b.lt 1515f
ldr T1d, [LUT, #(14*4)]
ldr T1d, [LUT, #(14 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y1.h}[6], [T1]
@@ -3594,35 +3594,35 @@ generate_jsimd_huff_encode_one_block 0
.endm
.macro LOAD8
ldr T0d, [LUT, #(0*4)]
ldr T0d, [LUT, #(0 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[0], [T0]
ldr T0d, [LUT, #(1*4)]
ldr T0d, [LUT, #(1 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[1], [T0]
ldr T0d, [LUT, #(2*4)]
ldr T0d, [LUT, #(2 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[2], [T0]
ldr T0d, [LUT, #(3*4)]
ldr T0d, [LUT, #(3 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[3], [T0]
ldr T0d, [LUT, #(4*4)]
ldr T0d, [LUT, #(4 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[4], [T0]
ldr T0d, [LUT, #(5*4)]
ldr T0d, [LUT, #(5 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[5], [T0]
ldr T0d, [LUT, #(6*4)]
ldr T0d, [LUT, #(6 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[6], [T0]
ldr T0d, [LUT, #(7*4)]
ldr T0d, [LUT, #(7 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[7], [T0]
.endm
@@ -3630,43 +3630,43 @@ generate_jsimd_huff_encode_one_block 0
.macro LOAD7
eor Y0.16b, Y0.16b, Y0.16b
ldr T0d, [LUT, #(0*4)]
ldr T0d, [LUT, #(0 * 4)]
add T0, BLOCK, T0, lsl #1
ld1 {Y0.h}[0], [T0]
cmp LENEND, #2
b.lt 77f
ldr T1d, [LUT, #(1*4)]
ldr T1d, [LUT, #(1 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[1], [T1]
cmp LENEND, #3
b.lt 77f
ldr T1d, [LUT, #(2*4)]
ldr T1d, [LUT, #(2 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[2], [T1]
cmp LENEND, #4
b.lt 77f
ldr T1d, [LUT, #(3*4)]
ldr T1d, [LUT, #(3 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[3], [T1]
cmp LENEND, #5
b.lt 77f
ldr T1d, [LUT, #(4*4)]
ldr T1d, [LUT, #(4 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[4], [T1]
cmp LENEND, #6
b.lt 77f
ldr T1d, [LUT, #(5*4)]
ldr T1d, [LUT, #(5 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[5], [T1]
cmp LENEND, #7
b.lt 77f
ldr T1d, [LUT, #(6*4)]
ldr T1d, [LUT, #(6 * 4)]
add T1, BLOCK, T1, lsl #1
ld1 {Y0.h}[6], [T1]

File diff suppressed because it is too large Load Diff

View File

@@ -56,12 +56,14 @@
((uint64_t)(uint8_t)g << 8) | \
((uint64_t)(uint8_t)h))
#define _uint64_set1_pi8(a) _uint64_set_pi8(a, a, a, a, a, a, a, a)
#define _uint64_set_pi16(a, b, c, d) (((uint64_t)(uint16_t)a << 48) | \
((uint64_t)(uint16_t)b << 32) | \
((uint64_t)(uint16_t)c << 16) | \
((uint64_t)(uint16_t)d))
/*
 * Build a 64-bit value from four 16-bit lanes:  a occupies bits 63-48, b bits
 * 47-32, c bits 31-16, and d bits 15-0.  Each argument is truncated to
 * uint16_t before being widened, so only its low 16 bits are used.  The
 * arguments are parenthesized so that the casts apply to the whole argument
 * expression rather than just its first operand.
 */
#define _uint64_set_pi16(a, b, c, d) \
  (((uint64_t)(uint16_t)(a) << 48) | \
   ((uint64_t)(uint16_t)(b) << 32) | \
   ((uint64_t)(uint16_t)(c) << 16) | \
   ((uint64_t)(uint16_t)(d)))
#define _uint64_set1_pi16(a) _uint64_set_pi16(a, a, a, a)
#define _uint64_set_pi32(a, b) (((uint64_t)(uint32_t)a << 32) | \
((uint64_t)(uint32_t)b))
/*
 * Build a 64-bit value from two 32-bit lanes:  a occupies bits 63-32 and b
 * bits 31-0.  Each argument is truncated to uint32_t before being widened, so
 * only its low 32 bits are used.  The arguments are parenthesized so that the
 * casts apply to the whole argument expression rather than just its first
 * operand.
 */
#define _uint64_set_pi32(a, b) \
  (((uint64_t)(uint32_t)(a) << 32) | \
   ((uint64_t)(uint32_t)(b)))
#define get_const_value(index) (*(__m64 *)&const_value[index])

View File

@@ -1,8 +1,10 @@
// This file generates the include file for the assembly
// implementations by abusing the C preprocessor.
//
// Note: Some things are manually defined as they need to
// be mapped to NASM types.
/*
* This file generates the include file for the assembly
* implementations by abusing the C preprocessor.
*
* Note: Some things are manually defined as they need to
* be mapped to NASM types.
*/
;
; Automatically generated include file from jsimdcfg.inc.h

View File

@@ -816,8 +816,7 @@ do_flip_h(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
(JDIMENSION)compptr->v_samp_factor, TRUE);
src_buffer = (*srcinfo->mem->access_virt_barray)
((j_common_ptr)srcinfo, src_coef_arrays[ci],
dst_blk_y + y_crop_blocks,
((j_common_ptr)srcinfo, src_coef_arrays[ci], dst_blk_y + y_crop_blocks,
(JDIMENSION)compptr->v_samp_factor, FALSE);
for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
dst_row_ptr = dst_buffer[offset_y];
@@ -830,8 +829,9 @@ do_flip_h(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
/* this unrolled loop doesn't need to know which row it's on... */
for (k = 0; k < DCTSIZE2; k += 2) {
*dst_ptr++ = *src_ptr++; /* copy even column */
*dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
*dst_ptr++ = *src_ptr++; /* copy even column */
*dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
change */
}
} else {
/* Copy last partial block(s) verbatim */
@@ -916,8 +916,7 @@ do_flip_v(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
} else {
/* Just copy row verbatim. */
jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
dst_buffer[offset_y],
compptr->width_in_blocks);
dst_buffer[offset_y], compptr->width_in_blocks);
}
}
}

View File

@@ -5,7 +5,7 @@
* Copyright (C) 1991-1996, Thomas G. Lane.
* Modified 2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2017, 2019, D. R. Commander.
* Copyright (C) 2017, 2019-2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -326,11 +326,12 @@ jinit_write_ppm(j_decompress_ptr cinfo)
if (cinfo->quantize_colors || BITS_IN_JSAMPLE != 8 ||
sizeof(JSAMPLE) != sizeof(char) ||
(cinfo->out_color_space != JCS_EXT_RGB
#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
&& cinfo->out_color_space != JCS_RGB
(cinfo->out_color_space != JCS_EXT_RGB &&
cinfo->out_color_space != JCS_RGB)) {
#else
cinfo->out_color_space != JCS_EXT_RGB) {
#endif
)) {
/* When quantizing, we need an output buffer for colormap indexes
* that's separate from the physical I/O buffer. We also need a
* separate buffer if pixel format translation must take place.