diff --git a/jdcolor.c b/jdcolor.c index dc0e3b6c..d3ae40c7 100644 --- a/jdcolor.c +++ b/jdcolor.c @@ -571,11 +571,10 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, * RGB565 conversion */ -#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \ - (((g) << 3) & 0x7E0) | ((b) >> 3)) -#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \ - (((g) << 11) & 0xE000) | \ - (((b) << 5) & 0x1F00)) +#define PACK_SHORT_565_LE(r, g, b) \ + ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3)) +#define PACK_SHORT_565_BE(r, g, b) \ + (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00)) #define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l) #define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r) diff --git a/jdmerge.c b/jdmerge.c index 833ad675..3a456d65 100644 --- a/jdmerge.c +++ b/jdmerge.c @@ -392,11 +392,10 @@ h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, * RGB565 conversion */ -#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \ - (((g) << 3) & 0x7E0) | ((b) >> 3)) -#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \ - (((g) << 11) & 0xE000) | \ - (((b) << 5) & 0x1F00)) +#define PACK_SHORT_565_LE(r, g, b) \ + ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3)) +#define PACK_SHORT_565_BE(r, g, b) \ + (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00)) #define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l) #define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r) diff --git a/rdppm.c b/rdppm.c index a8507b90..2a58e796 100644 --- a/rdppm.c +++ b/rdppm.c @@ -659,11 +659,12 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo) if (maxval > 255) { source->pub.get_pixel_rows = get_word_rgb_row; } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR) && - (cinfo->in_color_space == JCS_EXT_RGB #if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3 - || cinfo->in_color_space == JCS_RGB + (cinfo->in_color_space == JCS_EXT_RGB || + cinfo->in_color_space == JCS_RGB)) { +#else + cinfo->in_color_space == JCS_EXT_RGB) { #endif - )) { source->pub.get_pixel_rows = get_raw_row; use_raw_buffer = TRUE; need_rescale = FALSE; diff --git a/rdtarga.c b/rdtarga.c index 37bd286a..cd8a363e 100644 --- a/rdtarga.c +++ b/rdtarga.c @@ -344,8 +344,9 @@ start_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo) unsigned int width, height, maplen; boolean is_bottom_up; -#define GET_2B(offset) ((unsigned int)UCH(targaheader[offset]) + \ - (((unsigned int)UCH(targaheader[offset + 1])) << 8)) +#define GET_2B(offset) \ + ((unsigned int)UCH(targaheader[offset]) + \ + (((unsigned int)UCH(targaheader[offset + 1])) << 8)) if (!ReadOK(source->pub.input_file, targaheader, 18)) ERREXIT(cinfo, JERR_INPUT_EOF); diff --git a/release/rpm.spec.in b/release/rpm.spec.in index f8db7640..6b5cd2be 100644 --- a/release/rpm.spec.in +++ b/release/rpm.spec.in @@ -1,36 +1,36 @@ %global _docdir %{_defaultdocdir}/%{name}-%{version} -%define _prefix @CMAKE_INSTALL_PREFIX@ -%define _bindir @CMAKE_INSTALL_FULL_BINDIR@ -%define _datarootdir @CMAKE_INSTALL_FULL_DATAROOTDIR@ -%define _includedir @CMAKE_INSTALL_FULL_INCLUDEDIR@ -%define _javadir @CMAKE_INSTALL_FULL_JAVADIR@ -%define _mandir @CMAKE_INSTALL_FULL_MANDIR@ -%define _enable_static @ENABLE_STATIC@ -%define _enable_shared @ENABLE_SHARED@ -%define _with_turbojpeg @WITH_TURBOJPEG@ -%define _with_java @WITH_JAVA@ +%define _prefix @CMAKE_INSTALL_PREFIX@ +%define _bindir @CMAKE_INSTALL_FULL_BINDIR@ +%define _datarootdir @CMAKE_INSTALL_FULL_DATAROOTDIR@ +%define _includedir @CMAKE_INSTALL_FULL_INCLUDEDIR@ +%define _javadir @CMAKE_INSTALL_FULL_JAVADIR@ +%define _mandir @CMAKE_INSTALL_FULL_MANDIR@ +%define _enable_static @ENABLE_STATIC@ +%define _enable_shared @ENABLE_SHARED@ +%define _with_turbojpeg @WITH_TURBOJPEG@ +%define _with_java @WITH_JAVA@ %if "%{?__isa_bits:1}" == "1" -%define _bits %{__isa_bits} +%define _bits %{__isa_bits} %else # RPM < 4.6 %if "%{_lib}" == "lib64" -%define _bits 64 +%define _bits 64 %else -%define _bits 32 +%define _bits 32 %endif %endif #-->%if 1 %if "%{_bits}" == "64" -%define _libdir %{_exec_prefix}/lib64 +%define _libdir %{_exec_prefix}/lib64 %else %if "%{_prefix}" == "/opt/libjpeg-turbo" -%define _libdir %{_exec_prefix}/lib32 +%define _libdir %{_exec_prefix}/lib32 %endif %endif #-->%else -%define _libdir @CMAKE_INSTALL_FULL_LIBDIR@ +%define _libdir @CMAKE_INSTALL_FULL_LIBDIR@ #-->%endif Summary: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs @@ -101,7 +101,6 @@ broader range of users and developers. #-->make DESTDIR=$RPM_BUILD_ROOT %install - rm -rf $RPM_BUILD_ROOT make install DESTDIR=$RPM_BUILD_ROOT /sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir} @@ -163,38 +162,38 @@ rm -rf $RPM_BUILD_ROOT %doc %{_docdir}/* %dir %{_prefix} %if "%{_prefix}" == "@CMAKE_INSTALL_DEFAULT_PREFIX@" && "%{_docdir}" != "%{_prefix}/doc" - %{_prefix}/doc + %{_prefix}/doc %endif %dir %{_bindir} %{_bindir}/cjpeg %{_bindir}/djpeg %{_bindir}/jpegtran %if "%{_with_turbojpeg}" == "1" - %{_bindir}/tjbench + %{_bindir}/tjbench %endif %{_bindir}/rdjpgcom %{_bindir}/wrjpgcom %dir %{_libdir} %if "%{_enable_shared}" == "1" - %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@ - %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@ - %{_libdir}/libjpeg.so + %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@ + %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@ + %{_libdir}/libjpeg.so %endif %if "%{_enable_static}" == "1" - %{_libdir}/libjpeg.a + %{_libdir}/libjpeg.a %endif %dir %{_libdir}/pkgconfig %{_libdir}/pkgconfig/libjpeg.pc %if "%{_with_turbojpeg}" == "1" - %if "%{_enable_shared}" == "1" || "%{_with_java}" == "1" - %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@ - %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_MAJOR_VERSION@ - %{_libdir}/libturbojpeg.so - %endif - %if "%{_enable_static}" == "1" - %{_libdir}/libturbojpeg.a - %endif - %{_libdir}/pkgconfig/libturbojpeg.pc + %if "%{_enable_shared}" == "1" || "%{_with_java}" == "1" + %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@ + %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_MAJOR_VERSION@ + %{_libdir}/libturbojpeg.so + %endif + %if "%{_enable_static}" == "1" + %{_libdir}/libturbojpeg.a + %endif + %{_libdir}/pkgconfig/libturbojpeg.pc %endif %dir %{_includedir} %{_includedir}/jconfig.h @@ -202,7 +201,7 @@ rm -rf $RPM_BUILD_ROOT %{_includedir}/jmorecfg.h %{_includedir}/jpeglib.h %if "%{_with_turbojpeg}" == "1" - %{_includedir}/turbojpeg.h + %{_includedir}/turbojpeg.h %endif %dir %{_mandir} %dir %{_mandir}/man1 @@ -212,10 +211,11 @@ rm -rf $RPM_BUILD_ROOT %{_mandir}/man1/rdjpgcom.1* %{_mandir}/man1/wrjpgcom.1* %if "%{_prefix}" != "%{_datarootdir}" - %dir %{_datarootdir} + %dir %{_datarootdir} %endif %if "%{_with_java}" == "1" - %dir %{_javadir} - %{_javadir}/turbojpeg.jar + %dir %{_javadir} + %{_javadir}/turbojpeg.jar %endif + %changelog diff --git a/simd/arm/jsimd_neon.S b/simd/arm/jsimd_neon.S index f8f0dad1..30f9cc63 100644 --- a/simd/arm/jsimd_neon.S +++ b/simd/arm/jsimd_neon.S @@ -107,69 +107,69 @@ _\fname: * Uses some ideas from the comments in 'simd/jiss2int-64.asm' */ #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \ - DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ - JLONG q1, q2, q3, q4, q5, q6, q7; \ - JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ - \ - /* 1-D iDCT input data */ \ - row0 = xrow0; \ - row1 = xrow1; \ - row2 = xrow2; \ - row3 = xrow3; \ - row4 = xrow4; \ - row5 = xrow5; \ - row6 = xrow6; \ - row7 = xrow7; \ - \ - q5 = row7 + row3; \ - q4 = row5 + row1; \ - q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ - MULTIPLY(q4, FIX_1_175875602); \ - q7 = MULTIPLY(q5, FIX_1_175875602) + \ - MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ - q2 = MULTIPLY(row2, FIX_0_541196100) + \ - MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ - q4 = q6; \ - q3 = ((JLONG)row0 - (JLONG)row4) << 13; \ - q6 += MULTIPLY(row5, -FIX_2_562915447) + \ - MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ - /* now we can use q1 (reloadable constants have been used up) */ \ - q1 = q3 + q2; \ - q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ - MULTIPLY(row1, -FIX_0_899976223); \ - q5 = q7; \ - q1 = q1 + q6; \ - q7 += MULTIPLY(row7, -FIX_0_899976223) + \ - MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ - \ - /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ - tmp11_plus_tmp2 = q1; \ - row1 = 0; \ - \ - q1 = q1 - q6; \ - q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ - MULTIPLY(row3, -FIX_2_562915447); \ - q1 = q1 - q6; \ - q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ - MULTIPLY(row6, FIX_0_541196100); \ - q3 = q3 - q2; \ - \ - /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ - tmp11_minus_tmp2 = q1; \ - \ - q1 = ((JLONG)row0 + (JLONG)row4) << 13; \ - q2 = q1 + q6; \ - q1 = q1 - q6; \ - \ - /* pick up the results */ \ - tmp0 = q4; \ - tmp1 = q5; \ - tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ - tmp3 = q7; \ - tmp10 = q2; \ - tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ - tmp12 = q3; \ - tmp13 = q1; \ + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ + JLONG q1, q2, q3, q4, q5, q6, q7; \ + JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ + \ + /* 1-D iDCT input data */ \ + row0 = xrow0; \ + row1 = xrow1; \ + row2 = xrow2; \ + row3 = xrow3; \ + row4 = xrow4; \ + row5 = xrow5; \ + row6 = xrow6; \ + row7 = xrow7; \ + \ + q5 = row7 + row3; \ + q4 = row5 + row1; \ + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ + MULTIPLY(q4, FIX_1_175875602); \ + q7 = MULTIPLY(q5, FIX_1_175875602) + \ + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ + q2 = MULTIPLY(row2, FIX_0_541196100) + \ + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ + q4 = q6; \ + q3 = ((JLONG)row0 - (JLONG)row4) << 13; \ + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ + /* now we can use q1 (reloadable constants have been used up) */ \ + q1 = q3 + q2; \ + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ + MULTIPLY(row1, -FIX_0_899976223); \ + q5 = q7; \ + q1 = q1 + q6; \ + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ + \ + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ + tmp11_plus_tmp2 = q1; \ + row1 = 0; \ + \ + q1 = q1 - q6; \ + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ + MULTIPLY(row3, -FIX_2_562915447); \ + q1 = q1 - q6; \ + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ + MULTIPLY(row6, FIX_0_541196100); \ + q3 = q3 - q2; \ + \ + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ + tmp11_minus_tmp2 = q1; \ + \ + q1 = ((JLONG)row0 + (JLONG)row4) << 13; \ + q2 = q1 + q6; \ + q1 = q1 - q6; \ + \ + /* pick up the results */ \ + tmp0 = q4; \ + tmp1 = q5; \ + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ + tmp3 = q7; \ + tmp10 = q2; \ + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ + tmp12 = q3; \ + tmp13 = q1; \ } #define XFIX_0_899976223 d0[0] @@ -261,7 +261,7 @@ asm_function jsimd_idct_islow_neon vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ add ip, ip, #16 vmul.s16 q15, q15, q3 - vpush {d8-d15} /* save Neon registers */ + vpush {d8 - d15} /* save Neon registers */ /* 1-D IDCT, pass 1, left 4x8 half */ vadd.s16 d4, ROW7L, ROW3L vadd.s16 d5, ROW5L, ROW1L @@ -507,7 +507,7 @@ asm_function jsimd_idct_islow_neon vqrshrn.s16 d17, q9, #2 vqrshrn.s16 d18, q10, #2 vqrshrn.s16 d19, q11, #2 - vpop {d8-d15} /* restore Neon registers */ + vpop {d8 - d15} /* restore Neon registers */ vqrshrn.s16 d20, q12, #2 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ vtrn.16 q8, q9 @@ -749,7 +749,7 @@ asm_function jsimd_idct_ifast_neon vmul.s16 q13, q13, q1 vld1.16 {d0}, [ip, :64] /* load constants */ vmul.s16 q15, q15, q3 - vpush {d8-d13} /* save Neon registers */ + vpush {d8 - d13} /* save Neon registers */ /* 1-D IDCT, pass 1 */ vsub.s16 q2, q10, q14 vadd.s16 q14, q10, q14 @@ -842,7 +842,7 @@ asm_function jsimd_idct_ifast_neon vadd.s16 q14, q5, q3 vsub.s16 q9, q5, q3 vsub.s16 q13, q10, q2 - vpop {d8-d13} /* restore Neon registers */ + vpop {d8 - d13} /* restore Neon registers */ vadd.s16 q10, q10, q2 vsub.s16 q11, q12, q1 vadd.s16 q12, q12, q1 @@ -1010,7 +1010,7 @@ asm_function jsimd_idct_4x4_neon TMP3 .req r2 TMP4 .req ip - vpush {d8-d15} + vpush {d8 - d15} /* Load constants (d3 is just used for padding) */ adr TMP4, jsimd_idct_4x4_neon_consts @@ -1099,7 +1099,7 @@ asm_function jsimd_idct_4x4_neon vst1.8 {d27[7]}, [TMP4]! #endif - vpop {d8-d15} + vpop {d8 - d15} bx lr .unreq DCT_TABLE @@ -1167,7 +1167,7 @@ asm_function jsimd_idct_2x2_neon TMP1 .req r0 TMP2 .req ip - vpush {d8-d15} + vpush {d8 - d15} /* Load constants */ adr TMP2, jsimd_idct_2x2_neon_consts @@ -1254,7 +1254,7 @@ asm_function jsimd_idct_2x2_neon vst1.8 {d26[1]}, [TMP2]! vst1.8 {d27[5]}, [TMP2]! - vpop {d8-d15} + vpop {d8 - d15} bx lr .unreq DCT_TABLE @@ -1508,7 +1508,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .unreq INPUT_BUF /* Save Neon registers */ - vpush {d8-d15} + vpush {d8 - d15} /* Initially set d10, d11, d12, d13 to 0xFF */ vmov.u8 q5, #255 @@ -1571,7 +1571,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon bgt 0b 9: /* Restore all registers and return */ - vpop {d8-d15} + vpop {d8 - d15} pop {r4, r5, r6, r7, r8, r9, r10, pc} .unreq OUTPUT_WIDTH @@ -1823,7 +1823,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon .unreq OUTPUT_BUF /* Save Neon registers */ - vpush {d8-d15} + vpush {d8 - d15} /* Outer loop over scanlines */ cmp NUM_ROWS, #1 @@ -1882,7 +1882,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon bgt 0b 9: /* Restore all registers and return */ - vpop {d8-d15} + vpop {d8 - d15} pop {r4, r5, r6, r7, r8, r9, r10, pc} .unreq OUTPUT_WIDTH @@ -2011,7 +2011,7 @@ asm_function jsimd_fdct_ifast_neon DATA .req r0 TMP .req ip - vpush {d8-d15} + vpush {d8 - d15} /* Load constants */ adr TMP, jsimd_fdct_ifast_neon_consts @@ -2096,7 +2096,7 @@ asm_function jsimd_fdct_ifast_neon vst1.16 {d24, d25, d26, d27}, [DATA, :128]! vst1.16 {d28, d29, d30, d31}, [DATA, :128] - vpop {d8-d15} + vpop {d8 - d15} bx lr .unreq DATA @@ -2404,7 +2404,7 @@ asm_function jsimd_h2v1_fancy_upsample_neon TMP .req lr push {r4, r5, r6, lr} - vpush {d8-d15} + vpush {d8 - d15} ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] cmp MAX_V_SAMP_FACTOR, #0 @@ -2422,7 +2422,7 @@ asm_function jsimd_h2v1_fancy_upsample_neon bgt 11b 99: - vpop {d8-d15} + vpop {d8 - d15} pop {r4, r5, r6, pc} .unreq MAX_V_SAMP_FACTOR diff --git a/simd/arm64/jsimd_neon.S b/simd/arm64/jsimd_neon.S index 3ed5f587..85598326 100644 --- a/simd/arm64/jsimd_neon.S +++ b/simd/arm64/jsimd_neon.S @@ -613,21 +613,21 @@ asm_function jsimd_idct_islow_neon movi v0.16b, #(CENTERJSAMPLE) /* Prepare pointers (dual-issue with Neon instructions) */ ldp TMP1, TMP2, [OUTPUT_BUF], 16 - sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) ldp TMP3, TMP4, [OUTPUT_BUF], 16 - sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) add TMP1, TMP1, OUTPUT_COL - sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) add TMP2, TMP2, OUTPUT_COL - sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) add TMP3, TMP3, OUTPUT_COL - sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) add TMP4, TMP4, OUTPUT_COL - sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) ldp TMP5, TMP6, [OUTPUT_BUF], 16 - sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) ldp TMP7, TMP8, [OUTPUT_BUF], 16 - sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16) + sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) add TMP5, TMP5, OUTPUT_COL add v16.16b, v28.16b, v0.16b add TMP6, TMP6, OUTPUT_COL @@ -739,14 +739,14 @@ asm_function jsimd_idct_islow_neon add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ - rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ mov v6.16b, v15.16b mov v7.16b, v15.16b mov v8.16b, v15.16b @@ -823,14 +823,14 @@ asm_function jsimd_idct_islow_neon mov v3.16b, v14.16b mov v4.16b, v14.16b mov v5.16b, v14.16b - rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ b 1b .balign 16 @@ -933,22 +933,22 @@ asm_function jsimd_idct_islow_neon sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ - rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ - rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ b 1b .unreq DCT_TABLE @@ -1405,7 +1405,7 @@ asm_function jsimd_idct_4x4_neon st1 {v27.b}[7], [TMP4], 1 #endif - /* vpop {v8.4h - v15.4h} ;not available */ + /* vpop {v8.4h - v15.4h} (not available) */ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 blr x30 @@ -1473,7 +1473,7 @@ asm_function jsimd_idct_2x2_neon instruction ensures that those bits are set to zero. */ uxtw x3, w3 - /* vpush {v8.4h - v15.4h} ; not available */ + /* vpush {v8.4h - v15.4h} (not available) */ sub sp, sp, 64 mov x9, sp diff --git a/simd/loongson/jsimd_mmi.h b/simd/loongson/jsimd_mmi.h index 2506aa86..59b2ee0b 100644 --- a/simd/loongson/jsimd_mmi.h +++ b/simd/loongson/jsimd_mmi.h @@ -47,11 +47,13 @@ ((uint64_t)(uint8_t)f << 16) | \ ((uint64_t)(uint8_t)g << 8) | \ ((uint64_t)(uint8_t)h)) -#define _uint64_set_pi16(a, b, c, d) (((uint64_t)(uint16_t)a << 48) | \ - ((uint64_t)(uint16_t)b << 32) | \ - ((uint64_t)(uint16_t)c << 16) | \ - ((uint64_t)(uint16_t)d)) -#define _uint64_set_pi32(a, b) (((uint64_t)(uint32_t)a << 32) | \ - ((uint64_t)(uint32_t)b)) +#define _uint64_set_pi16(a, b, c, d) \ + (((uint64_t)(uint16_t)a << 48) | \ + ((uint64_t)(uint16_t)b << 32) | \ + ((uint64_t)(uint16_t)c << 16) | \ + ((uint64_t)(uint16_t)d)) +#define _uint64_set_pi32(a, b) \ + (((uint64_t)(uint32_t)a << 32) | \ + ((uint64_t)(uint32_t)b)) #define get_const_value(index) (*(__m64 *)&const_value[index]) diff --git a/simd/mips/jsimd_dspr2.S b/simd/mips/jsimd_dspr2.S index a28c1161..c99288a8 100644 --- a/simd/mips/jsimd_dspr2.S +++ b/simd/mips/jsimd_dspr2.S @@ -41,10 +41,10 @@ LEAF_DSPR2(jsimd_c_null_convert_dspr2) */ SAVE_REGS_ON_STACK 8, s0, s1 - lw t9, 24(sp) // t9 = num_rows - lw s0, 28(sp) // s0 = cinfo->num_components - andi t0, a0, 3 // t0 = cinfo->image_width & 3 - beqz t0, 4f // no residual + lw t9, 24(sp) /* t9 = num_rows */ + lw s0, 28(sp) /* s0 = cinfo->num_components */ + andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */ + beqz t0, 4f /* no residual */ nop 0: addiu t9, t9, -1 @@ -52,10 +52,10 @@ LEAF_DSPR2(jsimd_c_null_convert_dspr2) li t1, 0 1: sll t3, t1, 2 - lwx t5, t3(a2) // t5 = outptr = output_buf[ci] - lw t2, 0(a1) // t2 = inptr = *input_buf + lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ + lw t2, 0(a1) /* t2 = inptr = *input_buf */ sll t4, a3, 2 - lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] + lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ addu t2, t2, t1 addu s1, t5, a0 addu t6, t5, t0 @@ -94,10 +94,10 @@ LEAF_DSPR2(jsimd_c_null_convert_dspr2) li t1, 0 5: sll t3, t1, 2 - lwx t5, t3(a2) // t5 = outptr = output_buf[ci] - lw t2, 0(a1) // t2 = inptr = *input_buf + lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ + lw t2, 0(a1) /* t2 = inptr = *input_buf */ sll t4, a3, 2 - lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] + lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ addu t2, t2, t1 addu s1, t5, a0 addu t6, t5, t0 @@ -163,29 +163,29 @@ LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2) */ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - lw t7, 48(sp) // t7 = num_rows - li s0, 0x4c8b // FIX(0.29900) - li s1, 0x9646 // FIX(0.58700) - li s2, 0x1d2f // FIX(0.11400) - li s3, 0xffffd4cd // -FIX(0.16874) - li s4, 0xffffab33 // -FIX(0.33126) - li s5, 0x8000 // FIX(0.50000) - li s6, 0xffff94d1 // -FIX(0.41869) - li s7, 0xffffeb2f // -FIX(0.08131) - li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 + lw t7, 48(sp) /* t7 = num_rows */ + li s0, 0x4c8b /* FIX(0.29900) */ + li s1, 0x9646 /* FIX(0.58700) */ + li s2, 0x1d2f /* FIX(0.11400) */ + li s3, 0xffffd4cd /* -FIX(0.16874) */ + li s4, 0xffffab33 /* -FIX(0.33126) */ + li s5, 0x8000 /* FIX(0.50000) */ + li s6, 0xffff94d1 /* -FIX(0.41869) */ + li s7, 0xffffeb2f /* -FIX(0.08131) */ + li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */ 0: - addiu t7, -1 // --num_rows - lw t6, 0(a1) // t6 = input_buf[0] + addiu t7, -1 /* --num_rows */ + lw t6, 0(a1) /* t6 = input_buf[0] */ lw t0, 0(a2) lw t1, 4(a2) lw t2, 8(a2) sll t3, a3, 2 - lwx t0, t3(t0) // t0 = output_buf[0][output_row] - lwx t1, t3(t1) // t1 = output_buf[1][output_row] - lwx t2, t3(t2) // t2 = output_buf[2][output_row] + lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */ + lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */ + lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */ - addu t9, t2, a0 // t9 = end address + addu t9, t2, a0 /* t9 = end address */ addiu a3, 1 1: @@ -273,10 +273,10 @@ LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) lw s1, 48(sp) li t3, 0x8000 - li t4, 0x166e9 // FIX(1.40200) - li t5, 0x1c5a2 // FIX(1.77200) - li t6, 0xffff492e // -FIX(0.71414) - li t7, 0xffffa7e6 // -FIX(0.34414) + li t4, 0x166e9 /* FIX(1.40200) */ + li t5, 0x1c5a2 /* FIX(1.77200) */ + li t6, 0xffff492e /* -FIX(0.71414) */ + li t7, 0xffffa7e6 /* -FIX(0.34414) */ repl.ph t8, 128 0: @@ -293,25 +293,25 @@ LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) addiu a2, 1 1: - lbu s7, 0(s4) // cr - lbu s6, 0(s3) // cb - lbu s5, 0(s2) // y + lbu s7, 0(s4) /* cr */ + lbu s6, 0(s3) /* cb */ + lbu s5, 0(s2) /* y */ addiu s2, 1 addiu s4, 1 addiu s7, -128 addiu s6, -128 mul t2, t7, s6 - mul t0, t6, s7 // Crgtab[cr] + mul t0, t6, s7 /* Crgtab[cr] */ sll s7, 15 - mulq_rs.w t1, t4, s7 // Crrtab[cr] + mulq_rs.w t1, t4, s7 /* Crrtab[cr] */ sll s6, 15 - addu t2, t3 // Cbgtab[cb] + addu t2, t3 /* Cbgtab[cb] */ addu t2, t0 - mulq_rs.w t0, t5, s6 // Cbbtab[cb] + mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */ sra t2, 16 addu t1, s5 - addu t2, s5 // add y + addu t2, s5 /* add y */ ins t2, t1, 16, 16 subu.ph t2, t2, t8 addu t0, s5 @@ -319,7 +319,7 @@ LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) subu t0, 128 shra.ph t2, t2, 8 shll_s.w t0, t0, 24 - addu.ph t2, t2, t8 // clip & store + addu.ph t2, t2, t8 /* clip & store */ sra t0, t0, 24 sra t1, t2, 16 addiu t0, 128 @@ -382,15 +382,15 @@ LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2) */ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - li s0, 0x4c8b // s0 = FIX(0.29900) - li s1, 0x9646 // s1 = FIX(0.58700) - li s2, 0x1d2f // s2 = FIX(0.11400) - li s7, 0x8000 // s7 = FIX(0.50000) + li s0, 0x4c8b /* s0 = FIX(0.29900) */ + li s1, 0x9646 /* s1 = FIX(0.58700) */ + li s2, 0x1d2f /* s2 = FIX(0.11400) */ + li s7, 0x8000 /* s7 = FIX(0.50000) */ lw s6, 48(sp) andi t7, a0, 3 0: - addiu s6, -1 // s6 = num_rows + addiu s6, -1 /* s6 = num_rows */ lw t0, 0(a1) lw t1, 0(a2) sll t3, a3, 2 @@ -532,59 +532,59 @@ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) */ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra - lw t9, 56(sp) // cinfo->sample_range_limit + lw t9, 56(sp) /* cinfo->sample_range_limit */ lw v0, 0(a1) lw v1, 4(a1) lw t0, 8(a1) sll t1, a2, 3 addiu t2, t1, 4 sll t3, a2, 2 - lw t4, 0(a3) // t4 = output_buf[0] - lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2] - lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] - lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] - lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] - lw t7, 4(a3) // t7 = output_buf[1] + lw t4, 0(a3) /* t4 = output_buf[0] */ + lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */ + lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */ + lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */ + lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */ + lw t7, 4(a3) /* t7 = output_buf[1] */ li s1, 0xe6ea - addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] - addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] - addiu s1, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] - xori s2, s1, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] + addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */ + addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */ + addiu s1, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ + xori s2, s1, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ srl t3, a0, 1 blez t3, 2f - addu t0, t5, t3 // t0 = end address + addu t0, t5, t3 /* t0 = end address */ 1: lbu t3, 0(t5) lbu s3, 0(t6) addiu t5, t5, 1 - addiu t3, t3, -128 // (cb - 128) - addiu s3, s3, -128 // (cr - 128) + addiu t3, t3, -128 /* (cb - 128) */ + addiu s3, s3, -128 /* (cr - 128) */ mult $ac1, s1, t3 madd $ac1, s2, s3 sll s3, s3, 15 sll t3, t3, 15 - mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS + mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ extr_r.w s5, $ac1, 16 - mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS + mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ lbu v0, 0(t1) addiu t6, t6, 1 addiu t1, t1, 2 - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ lbu AT, 0(t3) lbu s7, 0(s3) lbu ra, 0(v1) lbu v0, -1(t1) - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ lbu t3, 0(t3) lbu s3, 0(s3) lbu v1, 0(v1) @@ -592,23 +592,23 @@ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ lbu AT, 0(t3) lbu s7, 0(s3) lbu ra, 0(v1) lbu v0, 1(t2) addiu t2, t2, 2 - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ lbu t3, 0(t3) lbu s3, 0(s3) lbu v1, 0(v1) @@ -622,22 +622,22 @@ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) beqz t0, 4f lbu t3, 0(t5) lbu s3, 0(t6) - addiu t3, t3, -128 // (cb - 128) - addiu s3, s3, -128 // (cr - 128) + addiu t3, t3, -128 /* (cb - 128) */ + addiu s3, s3, -128 /* (cr - 128) */ mult $ac1, s1, t3 madd $ac1, s2, s3 sll s3, s3, 15 sll t3, t3, 15 lbu v0, 0(t1) extr_r.w s5, $ac1, 16 - mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS - mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue + mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ + mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ lbu t3, 0(t3) lbu s3, 0(s3) lbu v1, 0(v1) @@ -645,12 +645,12 @@ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) STORE_H2V2_1_PIXEL t3, s3, v1, t4 - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ lbu t3, 0(t3) lbu s3, 0(s3) lbu v1, 0(v1) @@ -733,41 +733,41 @@ LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra li t0, 0xe6ea - lw t1, 0(a1) // t1 = input_buf[0] - lw t2, 4(a1) // t2 = input_buf[1] - lw t3, 8(a1) // t3 = input_buf[2] - lw t8, 56(sp) // t8 = range_limit - addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] - addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] - addiu s0, t0, 0x9916 // s0 = 0x8000 - addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] - xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] + lw t1, 0(a1) /* t1 = input_buf[0] */ + lw t2, 4(a1) /* t2 = input_buf[1] */ + lw t3, 8(a1) /* t3 = input_buf[2] */ + lw t8, 56(sp) /* t8 = range_limit */ + addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */ + addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */ + addiu s0, t0, 0x9916 /* s0 = 0x8000 */ + addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ + xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ srl t0, a0, 1 sll t4, a2, 2 - lwx s5, t4(t1) // s5 = inptr0 - lwx s6, t4(t2) // s6 = inptr1 - lwx s7, t4(t3) // s7 = inptr2 - lw t7, 0(a3) // t7 = outptr + lwx s5, t4(t1) /* s5 = inptr0 */ + lwx s6, t4(t2) /* s6 = inptr1 */ + lwx s7, t4(t3) /* s7 = inptr2 */ + lw t7, 0(a3) /* t7 = outptr */ blez t0, 2f - addu t9, s6, t0 // t9 = end address + addu t9, s6, t0 /* t9 = end address */ 1: - lbu t2, 0(s6) // t2 = cb - lbu t0, 0(s7) // t0 = cr - lbu t1, 0(s5) // t1 = y - addiu t2, t2, -128 // t2 = cb - 128 - addiu t0, t0, -128 // t0 = cr - 128 + lbu t2, 0(s6) /* t2 = cb */ + lbu t0, 0(s7) /* t0 = cr */ + lbu t1, 0(s5) /* t1 = y */ + addiu t2, t2, -128 /* t2 = cb - 128 */ + addiu t0, t0, -128 /* t0 = cr - 128 */ mult $ac1, s4, t2 madd $ac1, s3, t0 sll t0, t0, 15 sll t2, t2, 15 - mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS + mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */ extr_r.w t5, $ac1, 16 - mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS + mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */ addiu s7, s7, 1 addiu s6, s6, 1 - addu t2, t1, t0 // t2 = y + cred - addu t3, t1, t5 // t3 = y + cgreen - addu t4, t1, t6 // t4 = y + cblue + addu t2, t1, t0 /* t2 = y + cred */ + addu t3, t1, t5 /* t3 = y + cgreen */ + addu t4, t1, t6 /* t4 = y + cblue */ addu t2, t8, t2 addu t3, t8, t3 addu t4, t8, t4 @@ -797,20 +797,20 @@ LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) lbu t2, 0(s6) lbu t0, 0(s7) lbu t1, 0(s5) - addiu t2, t2, -128 // (cb - 128) - addiu t0, t0, -128 // (cr - 128) + addiu t2, t2, -128 /* (cb - 128) */ + addiu t0, t0, -128 /* (cr - 128) */ mul t3, s4, t2 mul t4, s3, t0 sll t0, t0, 15 sll t2, t2, 15 - mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS - mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS + mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */ + mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */ addu t3, t3, s0 addu t3, t4, t3 - sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS - addu t2, t1, t0 // y + cred - addu t3, t1, t5 // y + cgreen - addu t4, t1, t6 // y + cblue + sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */ + addu t2, t1, t0 /* y + cred */ + addu t3, t1, t5 /* y + cgreen */ + addu t4, t1, t6 /* y + cblue */ addu t2, t8, t2 addu t3, t8, t3 addu t4, t8, t4 @@ -856,15 +856,15 @@ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 li s4, 0 - lw s2, 0(a3) // s2 = *output_data_ptr + lw s2, 0(a3) /* s2 = *output_data_ptr */ 0: li t9, 2 - lw s1, -4(a2) // s1 = inptr1 + lw s1, -4(a2) /* s1 = inptr1 */ 1: - lw s0, 0(a2) // s0 = inptr0 + lw s0, 0(a2) /* s0 = inptr0 */ lwx s3, s4(s2) - addiu s5, a1, -2 // s5 = downsampled_width - 2 + addiu s5, a1, -2 /* s5 = downsampled_width - 2 */ srl t4, s5, 1 sll t4, t4, 1 lbu t0, 0(s0) @@ -873,50 +873,50 @@ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) lbu t3, 1(s1) addiu s0, 2 addiu s1, 2 - addu t8, s0, t4 // t8 = end address - andi s5, s5, 1 // s5 = residual + addu t8, s0, t4 /* t8 = end address */ + andi s5, s5, 1 /* s5 = residual */ sll t4, t0, 1 sll t6, t1, 1 - addu t0, t0, t4 // t0 = (*inptr0++) * 3 - addu t1, t1, t6 // t1 = (*inptr0++) * 3 - addu t7, t0, t2 // t7 = thiscolsum - addu t6, t1, t3 // t5 = nextcolsum - sll t0, t7, 2 // t0 = thiscolsum * 4 - subu t1, t0, t7 // t1 = thiscolsum * 3 + addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */ + addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */ + addu t7, t0, t2 /* t7 = thiscolsum */ + addu t6, t1, t3 /* t5 = nextcolsum */ + sll t0, t7, 2 /* t0 = thiscolsum * 4 */ + subu t1, t0, t7 /* t1 = thiscolsum * 3 */ shra_r.w t0, t0, 4 addiu t1, 7 addu t1, t1, t6 srl t1, t1, 4 sb t0, 0(s3) sb t1, 1(s3) - beq t8, s0, 22f // skip to final iteration if width == 3 + beq t8, s0, 22f /* skip to final iteration if width == 3 */ addiu s3, 2 2: - lh t0, 0(s0) // t0 = A3|A2 - lh t2, 0(s1) // t2 = B3|B2 + lh t0, 0(s0) /* t0 = A3|A2 */ + lh t2, 0(s1) /* t2 = B3|B2 */ addiu s0, 2 addiu s1, 2 - preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 - preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 + preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */ + preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */ shll.ph t1, t0, 1 sll t3, t6, 1 - addu.ph t0, t1, t0 // t0 = A3*3|A2*3 - addu t3, t3, t6 // t3 = this * 3 - addu.ph t0, t0, t2 // t0 = next2|next1 + addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */ + addu t3, t3, t6 /* t3 = this * 3 */ + addu.ph t0, t0, t2 /* t0 = next2|next1 */ addu t1, t3, t7 - andi t7, t0, 0xFFFF // t7 = next1 + andi t7, t0, 0xFFFF /* t7 = next1 */ sll t2, t7, 1 - addu t2, t7, t2 // t2 = next1*3 + addu t2, t7, t2 /* t2 = next1*3 */ addu t4, t2, t6 - srl t6, t0, 16 // t6 = next2 - shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 + srl t6, t0, 16 /* t6 = next2 */ + shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */ addu t0, t3, t7 addiu t0, 7 - srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 - shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 + srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */ + shra_r.w t4, t4, 4 /* t3 = (next1*3 + this + 8) >> 4 */ addu t2, t2, t6 addiu t2, 7 - srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 + srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */ sb t1, 0(s3) sb t0, 1(s3) sb t4, 2(s3) @@ -933,8 +933,8 @@ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) addiu s1, 1 sll t3, t6, 1 sll t1, t0, 1 - addu t1, t0, t1 // t1 = inptr0 * 3 - addu t3, t3, t6 // t3 = thiscolsum * 3 + addu t1, t0, t1 /* t1 = inptr0 * 3 */ + addu t3, t3, t6 /* t3 = thiscolsum * 3 */ addu t5, t1, t2 addu t1, t3, t7 shra_r.w t1, t1, 4 @@ -948,8 +948,8 @@ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) bne t8, s0, 3b move t6, t5 4: - sll t0, t6, 2 // t0 = thiscolsum * 4 - subu t1, t0, t6 // t1 = thiscolsum * 3 + sll t0, t6, 2 /* t0 = thiscolsum * 4 */ + subu t1, t0, t6 /* t1 = thiscolsum * 3 */ addu t1, t1, t7 addiu s4, 4 shra_r.w t1, t1, 4 @@ -996,9 +996,9 @@ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) lw t7, 0(a2) lw s2, 0(s1) lbu t0, 0(t7) - lbu t1, 1(t7) // t1 = inptr[1] + lbu t1, 1(t7) /* t1 = inptr[1] */ sll t2, t0, 1 - addu t2, t2, t0 // t2 = invalue*3 + addu t2, t2, t0 /* t2 = invalue*3 */ addu t2, t2, t1 shra_r.w t2, t2, 2 sb t0, 0(s2) @@ -1006,28 +1006,28 @@ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) beqz t9, 11f addiu s2, 2 1: - ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| + ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */ ulw t1, 1(t7) - ulh t2, 4(t7) // t2 = |0|0|P5|P4| - preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| - preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| - preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| - preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| - preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| + ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */ + preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */ + preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */ + preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */ + preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */ + preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */ shll.ph t5, t4, 1 shll.ph t6, t1, 1 - addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| - addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| + addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */ + addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */ addu.ph t4, t3, s3 addu.ph t0, t0, s3 addu.ph t4, t4, t5 addu.ph t0, t0, t6 - shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| - shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| + shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */ + shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */ addu.ph t2, t2, t5 addu.ph t3, t3, t6 - shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| - shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| + shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */ + shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */ shll.ph t2, t2, 8 shll.ph t3, t3, 8 or t2, t4, t2 @@ -1047,7 +1047,7 @@ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) lbu t0, 0(t7) addiu t7, 1 sll t1, t0, 1 - addu t2, t0, t1 // t2 = invalue + addu t2, t0, t1 /* t2 = invalue */ lbu t3, -2(t7) lbu t4, 0(t7) addiu t3, 1 @@ -1066,7 +1066,7 @@ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) lbu t0, 0(t7) lbu t2, -1(t7) sll t1, t0, 1 - addu t1, t1, t0 // t1 = invalue * 3 + addu t1, t1, t0 /* t1 = invalue * 3 */ addu t1, t1, t2 addiu t1, 1 srl t1, t1, 2 @@ -1098,22 +1098,22 @@ LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 beqz a2, 7f - lw s1, 44(sp) // s1 = output_data - lw s0, 40(sp) // s0 = input_data + lw s1, 44(sp) /* s1 = output_data */ + lw s0, 40(sp) /* s0 = input_data */ srl s2, a0, 2 andi t9, a0, 2 srl t7, t9, 1 addu s2, t7, s2 - sll t0, a3, 3 // t0 = width_in_blocks*DCT + sll t0, a3, 3 /* t0 = width_in_blocks*DCT */ srl t7, t0, 1 subu s2, t7, s2 0: - andi t6, a0, 1 // t6 = temp_index + andi t6, a0, 1 /* t6 = temp_index */ addiu t6, -1 - lw t4, 0(s1) // t4 = outptr - lw t5, 0(s0) // t5 = inptr0 - li s3, 0 // s3 = bias - srl t7, a0, 1 // t7 = image_width1 + lw t4, 0(s1) /* t4 = outptr */ + lw t5, 0(s0) /* t5 = inptr0 */ + li s3, 0 /* s3 = bias */ + srl t7, a0, 1 /* t7 = image_width1 */ srl s4, t7, 2 andi t8, t7, 3 1: @@ -1151,12 +1151,12 @@ LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) 3: lbux t1, t6(t5) sll t1, 1 - addqh.w t2, t1, s3 // t2 = pixval1 + addqh.w t2, t1, s3 /* t2 = pixval1 */ xori s3, s3, 1 - addqh.w t3, t1, s3 // t3 = pixval2 + addqh.w t3, t1, s3 /* t3 = pixval2 */ blez s2, 5f append t3, t2, 8 - addu t5, t4, s2 // t5 = loop_end2 + addu t5, t4, s2 /* t5 = loop_end2 */ 4: ush t3, 0(t4) addiu s2, -1 @@ -1194,33 +1194,33 @@ LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 beqz a2, 8f - lw s1, 52(sp) // s1 = output_data - lw s0, 48(sp) // s0 = input_data + lw s1, 52(sp) /* s1 = output_data */ + lw s0, 48(sp) /* s0 = input_data */ - andi t6, a0, 1 // t6 = temp_index + andi t6, a0, 1 /* t6 = temp_index */ addiu t6, -1 - srl t7, a0, 1 // t7 = image_width1 + srl t7, a0, 1 /* t7 = image_width1 */ srl s4, t7, 2 andi t8, t7, 3 andi t9, a0, 2 srl s2, a0, 2 srl t7, t9, 1 addu s2, t7, s2 - sll t0, a3, 3 // s2 = width_in_blocks*DCT + sll t0, a3, 3 /* s2 = width_in_blocks*DCT */ srl t7, t0, 1 subu s2, t7, s2 0: - lw t4, 0(s1) // t4 = outptr - lw t5, 0(s0) // t5 = inptr0 - lw s7, 4(s0) // s7 = inptr1 - li s6, 1 // s6 = bias + lw t4, 0(s1) /* t4 = outptr */ + lw t5, 0(s0) /* t5 = inptr0 */ + lw s7, 4(s0) /* s7 = inptr1 */ + li s6, 1 /* s6 = bias */ 2: - ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| - ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| + ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */ + ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */ ulw t2, 4(t5) ulw t3, 4(s7) - precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| - ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| + precrq.ph.w t7, t0, t1 /* t2 = |P3|P2|Q3|Q2| */ + ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */ raddu.w.qb t1, t7 raddu.w.qb t0, t0 shra_r.w t1, t1, 2 @@ -1264,10 +1264,10 @@ LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) sll t0, 1 addu t1, t1, t0 addu t3, t1, s6 - srl t0, t3, 2 // t2 = pixval1 + srl t0, t3, 2 /* t2 = pixval1 */ xori s6, s6, 3 addu t2, t1, s6 - srl t1, t2, 2 // t3 = pixval2 + srl t1, t2, 2 /* t3 = pixval2 */ blez s2, 6f append t1, t0, 8 5: @@ -1307,15 +1307,15 @@ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - lw s7, 52(sp) // compptr->width_in_blocks - lw s0, 56(sp) // cinfo->image_width - lw s6, 48(sp) // cinfo->smoothing_factor - sll s7, 3 // output_cols = width_in_blocks * DCTSIZE + lw s7, 52(sp) /* compptr->width_in_blocks */ + lw s0, 56(sp) /* cinfo->image_width */ + lw s6, 48(sp) /* cinfo->smoothing_factor */ + sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */ sll v0, s7, 1 subu v0, v0, s0 blez v0, 2f move v1, zero - addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 + addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */ 0: addiu t1, a0, -4 sll t2, v1, 2 @@ -1337,20 +1337,20 @@ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) li v1, 16384 move t4, zero move t5, zero - subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 - sll t7, s6, 4 // t7 = tmp_smoot_f * 16 + subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */ + sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */ 3: /* Special case for first column: pretend column -1 is same as column 0 */ sll v0, t4, 2 - lwx t8, v0(a1) // outptr = output_data[outrow] + lwx t8, v0(a1) /* outptr = output_data[outrow] */ sll v1, t5, 2 addiu t9, v1, 4 addiu s0, v1, -4 addiu s1, v1, 8 - lwx s2, v1(a0) // inptr0 = input_data[inrow] - lwx t9, t9(a0) // inptr1 = input_data[inrow+1] - lwx s0, s0(a0) // above_ptr = input_data[inrow-1] - lwx s1, s1(a0) // below_ptr = input_data[inrow+2] + lwx s2, v1(a0) /* inptr0 = input_data[inrow] */ + lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */ + lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */ + lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */ lh v0, 0(s2) lh v1, 0(t9) lh t0, 0(s0) @@ -1387,7 +1387,7 @@ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) sb v0, -1(t8) addiu s4, s7, -2 and s4, s4, 3 - addu s5, s4, t8 // end address + addu s5, s4, t8 /* end address */ 4: lh v0, 0(s2) lh v1, 0(t9) @@ -1426,7 +1426,7 @@ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) addiu s1, s1, 2 addiu s5, s7, -2 subu s5, s5, s4 - addu s5, s5, t8 // end address + addu s5, s5, t8 /* end address */ 5: lh v0, 0(s2) lh v1, 0(t9) @@ -1607,24 +1607,24 @@ LEAF_DSPR2(jsimd_int_upsample_dspr2) SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 - lw s0, 0(a3) // s0 = output_data - lw s1, 32(sp) // s1 = cinfo->output_width - lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor - li t6, 0 // t6 = inrow + lw s0, 0(a3) /* s0 = output_data */ + lw s1, 32(sp) /* s1 = cinfo->output_width */ + lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */ + li t6, 0 /* t6 = inrow */ beqz s2, 10f - li s3, 0 // s3 = outrow + li s3, 0 /* s3 = outrow */ 0: addu t0, a2, t6 addu t7, s0, s3 - lw t3, 0(t0) // t3 = inptr - lw t8, 0(t7) // t8 = outptr + lw t3, 0(t0) /* t3 = inptr */ + lw t8, 0(t7) /* t8 = outptr */ beqz s1, 4f - addu t5, t8, s1 // t5 = outend + addu t5, t8, s1 /* t5 = outend */ 1: - lb t2, 0(t3) // t2 = invalue = *inptr++ + lb t2, 0(t3) /* t2 = invalue = *inptr++ */ addiu t3, 1 beqz a0, 3f - move t0, a0 // t0 = h_expand + move t0, a0 /* t0 = h_expand */ 2: sb t2, 0(t8) addiu t0, -1 @@ -1634,7 +1634,7 @@ LEAF_DSPR2(jsimd_int_upsample_dspr2) bgt t5, t8, 1b nop 4: - addiu t9, a1, -1 // t9 = v_expand - 1 + addiu t9, a1, -1 /* t9 = v_expand - 1 */ blez t9, 9f nop 5: @@ -1642,8 +1642,8 @@ LEAF_DSPR2(jsimd_int_upsample_dspr2) lw t4, 4(s0) subu t0, s1, 0xF blez t0, 7f - addu t5, t3, s1 // t5 = end address - andi t7, s1, 0xF // t7 = residual + addu t5, t3, s1 /* t5 = end address */ + andi t7, s1, 0xF /* t7 = residual */ subu t8, t5, t7 6: ulw t0, 0(t3) @@ -1689,33 +1689,33 @@ LEAF_DSPR2(jsimd_h2v1_upsample_dspr2) * a2 = input_data * a3 = output_data_ptr */ - lw t7, 0(a3) // t7 = output_data - andi t8, a1, 0xf // t8 = residual + lw t7, 0(a3) /* t7 = output_data */ + andi t8, a1, 0xf /* t8 = residual */ sll t0, a0, 2 blez a0, 4f - addu t9, t7, t0 // t9 = output_data end address + addu t9, t7, t0 /* t9 = output_data end address */ 0: - lw t5, 0(t7) // t5 = outptr - lw t6, 0(a2) // t6 = inptr - addu t3, t5, a1 // t3 = outptr + output_width (end address) - subu t3, t8 // t3 = end address - residual + lw t5, 0(t7) /* t5 = outptr */ + lw t6, 0(a2) /* t6 = inptr */ + addu t3, t5, a1 /* t3 = outptr + output_width (end address) */ + subu t3, t8 /* t3 = end address - residual */ beq t5, t3, 2f move t4, t8 1: - ulw t0, 0(t6) // t0 = |P3|P2|P1|P0| - ulw t2, 4(t6) // t2 = |P7|P6|P5|P4| - srl t1, t0, 16 // t1 = |X|X|P3|P2| - ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0| - ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2| - ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0| - ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2| + ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */ + ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */ + srl t1, t0, 16 /* t1 = |X|X|P3|P2| */ + ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */ + ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */ + ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */ + ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */ usw t0, 0(t5) usw t1, 4(t5) - srl t0, t2, 16 // t0 = |X|X|P7|P6| - ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4| - ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6| - ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4| - ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6| + srl t0, t2, 16 /* t0 = |X|X|P7|P6| */ + ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */ + ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */ + ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */ + ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */ usw t2, 8(t5) usw t0, 12(t5) addiu t5, 16 @@ -1751,12 +1751,12 @@ LEAF_DSPR2(jsimd_h2v2_upsample_dspr2) */ lw t7, 0(a3) blez a0, 7f - andi t9, a1, 0xf // t9 = residual + andi t9, a1, 0xf /* t9 = residual */ 0: - lw t6, 0(a2) // t6 = inptr - lw t5, 0(t7) // t5 = outptr - addu t8, t5, a1 // t8 = outptr end address - subu t8, t9 // t8 = end address - residual + lw t6, 0(a2) /* t6 = inptr */ + lw t5, 0(t7) /* t5 = outptr */ + addu t8, t5, a1 /* t8 = outptr end address */ + subu t8, t9 /* t8 = end address - residual */ beq t5, t8, 2f move t4, t9 1: @@ -1790,9 +1790,9 @@ LEAF_DSPR2(jsimd_h2v2_upsample_dspr2) bgtz t4, 2b addiu t5, 2 3: - lw t6, 0(t7) // t6 = outptr[0] - lw t5, 4(t7) // t5 = outptr[1] - addu t4, t6, a1 // t4 = new end address + lw t6, 0(t7) /* t6 = outptr[0] */ + lw t5, 4(t7) /* t5 = outptr[1] */ + addu t4, t6, a1 /* t4 = new end address */ beq a1, t9, 5f subu t8, t4, t9 4: @@ -1838,15 +1838,15 @@ LEAF_DSPR2(jsimd_idct_islow_dspr2) addiu sp, sp, -256 move v0, sp - addiu v1, zero, 8 // v1 = DCTSIZE = 8 + addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */ 1: - lh s4, 32(a0) // s4 = inptr[16] - lh s5, 64(a0) // s5 = inptr[32] - lh s6, 96(a0) // s6 = inptr[48] - lh t1, 112(a0) // t1 = inptr[56] - lh t7, 16(a0) // t7 = inptr[8] - lh t5, 80(a0) // t5 = inptr[40] - lh t3, 48(a0) // t3 = inptr[24] + lh s4, 32(a0) /* s4 = inptr[16] */ + lh s5, 64(a0) /* s5 = inptr[32] */ + lh s6, 96(a0) /* s6 = inptr[48] */ + lh t1, 112(a0) /* t1 = inptr[56] */ + lh t7, 16(a0) /* t7 = inptr[8] */ + lh t5, 80(a0) /* t5 = inptr[40] */ + lh t3, 48(a0) /* t3 = inptr[24] */ or s4, s4, t1 or s4, s4, t3 or s4, s4, t5 @@ -1855,9 +1855,9 @@ LEAF_DSPR2(jsimd_idct_islow_dspr2) or s4, s4, s6 bnez s4, 2f addiu v1, v1, -1 - lh s5, 0(a1) // quantptr[DCTSIZE*0] - lh s6, 0(a0) // inptr[DCTSIZE*0] - mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) + lh s5, 0(a1) /* quantptr[DCTSIZE*0] */ + lh s6, 0(a0) /* inptr[DCTSIZE*0] */ + mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */ sll s5, s5, 2 sw s5, 0(v0) sw s5, 32(v0) @@ -1873,68 +1873,77 @@ LEAF_DSPR2(jsimd_idct_islow_dspr2) lh t2, 48(a1) lh t4, 80(a1) lh t6, 16(a1) - mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) - mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) - mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) - mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) + mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7], + quantptr[DCTSIZE*7]) */ + mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3], + quantptr[DCTSIZE*3]) */ + mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5], + quantptr[DCTSIZE*5]) */ + mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1], + quantptr[DCTSIZE*1]) */ lh t4, 32(a1) lh t5, 32(a0) lh t6, 96(a1) lh t7, 96(a0) - addu s0, t0, t1 // z3 = tmp0 + tmp2 - addu s1, t1, t2 // z2 = tmp1 + tmp2 - addu s2, t2, t3 // z4 = tmp1 + tmp3 - addu s3, s0, s2 // z3 + z4 - addiu t9, zero, 9633 // FIX_1_175875602 - mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) - addu t8, t0, t3 // z1 = tmp0 + tmp3 - addiu t9, zero, 2446 // FIX_0_298631336 - mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) - addiu t9, zero, 16819 // FIX_2_053119869 - mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) - addiu t9, zero, 25172 // FIX_3_072711026 - mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) - addiu t9, zero, 12299 // FIX_1_501321110 - mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) - addiu t9, zero, 16069 // FIX_1_961570560 - mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) - addiu t9, zero, 3196 // FIX_0_390180644 - mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) - addiu t9, zero, 7373 // FIX_0_899976223 - mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) - addiu t9, zero, 20995 // FIX_2_562915447 - mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) - subu s0, s3, s0 // z3 += z5 - addu t0, t0, s0 // tmp0 += z3 - addu t1, t1, s0 // tmp2 += z3 - subu s2, s3, s2 // z4 += z5 - addu t2, t2, s2 // tmp1 += z4 - addu t3, t3, s2 // tmp3 += z4 - subu t0, t0, t8 // tmp0 += z1 - subu t1, t1, s1 // tmp2 += z2 - subu t2, t2, s1 // tmp1 += z2 - subu t3, t3, t8 // tmp3 += z1 - mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) - addiu t9, zero, 6270 // FIX_0_765366865 - mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) + addu s0, t0, t1 /* z3 = tmp0 + tmp2 */ + addu s1, t1, t2 /* z2 = tmp1 + tmp2 */ + addu s2, t2, t3 /* z4 = tmp1 + tmp3 */ + addu s3, s0, s2 /* z3 + z4 */ + addiu t9, zero, 9633 /* FIX_1_175875602 */ + mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + addu t8, t0, t3 /* z1 = tmp0 + tmp3 */ + addiu t9, zero, 2446 /* FIX_0_298631336 */ + mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + addiu t9, zero, 16819 /* FIX_2_053119869 */ + mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + addiu t9, zero, 25172 /* FIX_3_072711026 */ + mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + addiu t9, zero, 12299 /* FIX_1_501321110 */ + mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + addiu t9, zero, 16069 /* FIX_1_961570560 */ + mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ + addiu t9, zero, 3196 /* FIX_0_390180644 */ + mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ + addiu t9, zero, 7373 /* FIX_0_899976223 */ + mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ + addiu t9, zero, 20995 /* FIX_2_562915447 */ + mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ + subu s0, s3, s0 /* z3 += z5 */ + addu t0, t0, s0 /* tmp0 += z3 */ + addu t1, t1, s0 /* tmp2 += z3 */ + subu s2, s3, s2 /* z4 += z5 */ + addu t2, t2, s2 /* tmp1 += z4 */ + addu t3, t3, s2 /* tmp3 += z4 */ + subu t0, t0, t8 /* tmp0 += z1 */ + subu t1, t1, s1 /* tmp2 += z2 */ + subu t2, t2, s1 /* tmp1 += z2 */ + subu t3, t3, t8 /* tmp3 += z1 */ + mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2], + quantptr[DCTSIZE*2]) */ + addiu t9, zero, 6270 /* FIX_0_765366865 */ + mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6], + quantptr[DCTSIZE*6]) */ lh t4, 0(a1) lh t5, 0(a0) lh t6, 64(a1) lh t7, 64(a0) - mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) - mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) - addiu t9, zero, 4433 // FIX_0_541196100 - addu s3, s0, s1 // z2 + z3 - mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) - addiu t9, zero, 15137 // FIX_1_847759065 - mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) + mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */ + mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0]) */ + mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4], + quantptr[DCTSIZE*4]) */ + addiu t9, zero, 4433 /* FIX_0_541196100 */ + addu s3, s0, s1 /* z2 + z3 */ + mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ + addiu t9, zero, 15137 /* FIX_1_847759065 */ + mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */ addu t4, t5, t6 subu t5, t5, t6 - sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS - sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS - addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) - subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065) + sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */ + sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */ + addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */ + subu t6, s3, t8 /* tmp2 = + z1 + MULTIPLY(z3, -FIX_1_847759065) */ addu s0, t4, t7 subu s1, t4, t7 addu s2, t5, t6 @@ -1971,14 +1980,14 @@ LEAF_DSPR2(jsimd_idct_islow_dspr2) move v0, sp addiu v1, zero, 8 4: - lw t0, 8(v0) // z2 = (JLONG)wsptr[2] - lw t1, 24(v0) // z3 = (JLONG)wsptr[6] - lw t2, 0(v0) // (JLONG)wsptr[0] - lw t3, 16(v0) // (JLONG)wsptr[4] - lw s4, 4(v0) // (JLONG)wsptr[1] - lw s5, 12(v0) // (JLONG)wsptr[3] - lw s6, 20(v0) // (JLONG)wsptr[5] - lw s7, 28(v0) // (JLONG)wsptr[7] + lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */ + lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */ + lw t2, 0(v0) /* (JLONG)wsptr[0] */ + lw t3, 16(v0) /* (JLONG)wsptr[4] */ + lw s4, 4(v0) /* (JLONG)wsptr[1] */ + lw s5, 12(v0) /* (JLONG)wsptr[3] */ + lw s6, 20(v0) /* (JLONG)wsptr[5] */ + lw s7, 28(v0) /* (JLONG)wsptr[7] */ or s4, s4, t0 or s4, s4, t1 or s4, s4, t3 @@ -1997,60 +2006,64 @@ LEAF_DSPR2(jsimd_idct_islow_dspr2) b 6f nop 5: - addu t4, t0, t1 // z2 + z3 - addiu t8, zero, 4433 // FIX_0_541196100 - mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) - addiu t8, zero, 15137 // FIX_1_847759065 - mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) - addiu t8, zero, 6270 // FIX_0_765366865 - mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) - addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4] - subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4] - sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS - sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS - subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065) - subu t3, t2, t1 // tmp12 = tmp1 - tmp2 - addu t2, t2, t1 // tmp11 = tmp1 + tmp2 - addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) - subu t1, t4, t5 // tmp13 = tmp0 - tmp3 - addu t0, t4, t5 // tmp10 = tmp0 + tmp3 - lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7] - lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3] - lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5] - lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1] - addu s0, t4, t6 // z3 = tmp0 + tmp2 - addiu t8, zero, 9633 // FIX_1_175875602 - addu s1, t5, t7 // z4 = tmp1 + tmp3 - addu s2, s0, s1 // z3 + z4 - mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) - addu s3, t4, t7 // z1 = tmp0 + tmp3 - addu t9, t5, t6 // z2 = tmp1 + tmp2 - addiu t8, zero, 16069 // FIX_1_961570560 - mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) - addiu t8, zero, 3196 // FIX_0_390180644 - mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) - addiu t8, zero, 2446 // FIX_0_298631336 - mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) - addiu t8, zero, 7373 // FIX_0_899976223 - mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) - addiu t8, zero, 16819 // FIX_2_053119869 - mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) - addiu t8, zero, 20995 // FIX_2_562915447 - mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) - addiu t8, zero, 25172 // FIX_3_072711026 - mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) - addiu t8, zero, 12299 // FIX_1_501321110 - mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) - subu s0, s2, s0 // z3 += z5 - subu s1, s2, s1 // z4 += z5 + addu t4, t0, t1 /* z2 + z3 */ + addiu t8, zero, 4433 /* FIX_0_541196100 */ + mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ + addiu t8, zero, 15137 /* FIX_1_847759065 */ + mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */ + addiu t8, zero, 6270 /* FIX_0_765366865 */ + mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */ + addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */ + subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */ + sll t4, t4, 13 /* tmp0 = + (wsptr[0] + wsptr[4]) << CONST_BITS */ + sll t2, t2, 13 /* tmp1 = + (wsptr[0] - wsptr[4]) << CONST_BITS */ + subu t1, t5, t1 /* tmp2 = + z1 + MULTIPLY(z3, -FIX_1_847759065) */ + subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */ + addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */ + addu t5, t5, t0 /* tmp3 = + z1 + MULTIPLY(z2, FIX_0_765366865) */ + subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */ + addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */ + lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */ + lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */ + lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */ + lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */ + addu s0, t4, t6 /* z3 = tmp0 + tmp2 */ + addiu t8, zero, 9633 /* FIX_1_175875602 */ + addu s1, t5, t7 /* z4 = tmp1 + tmp3 */ + addu s2, s0, s1 /* z3 + z4 */ + mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + addu s3, t4, t7 /* z1 = tmp0 + tmp3 */ + addu t9, t5, t6 /* z2 = tmp1 + tmp2 */ + addiu t8, zero, 16069 /* FIX_1_961570560 */ + mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ + addiu t8, zero, 3196 /* FIX_0_390180644 */ + mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ + addiu t8, zero, 2446 /* FIX_0_298631336 */ + mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + addiu t8, zero, 7373 /* FIX_0_899976223 */ + mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ + addiu t8, zero, 16819 /* FIX_2_053119869 */ + mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + addiu t8, zero, 20995 /* FIX_2_562915447 */ + mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ + addiu t8, zero, 25172 /* FIX_3_072711026 */ + mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + addiu t8, zero, 12299 /* FIX_1_501321110 */ + mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + subu s0, s2, s0 /* z3 += z5 */ + subu s1, s2, s1 /* z4 += z5 */ addu t4, t4, s0 - subu t4, t4, s3 // tmp0 + subu t4, t4, s3 /* tmp0 */ addu t5, t5, s1 - subu t5, t5, t9 // tmp1 + subu t5, t5, t9 /* tmp1 */ addu t6, t6, s0 - subu t6, t6, t9 // tmp2 + subu t6, t6, t9 /* tmp2 */ addu t7, t7, s1 - subu t7, t7, s3 // tmp3 + subu t7, t7, s3 /* tmp3 */ addu s0, t0, t7 subu t0, t0, t7 addu t7, t2, t6 @@ -2116,124 +2129,124 @@ LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) */ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - addiu t9, a0, 16 // end address + addiu t9, a0, 16 /* end address */ or AT, a3, zero 0: - lw s0, 0(a1) // quantptr[DCTSIZE*0] - lw t0, 0(a0) // inptr[DCTSIZE*0] - lw t1, 16(a0) // inptr[DCTSIZE*1] - muleq_s.w.phl v0, t0, s0 // tmp0 ... - lw t2, 32(a0) // inptr[DCTSIZE*2] - lw t3, 48(a0) // inptr[DCTSIZE*3] - lw t4, 64(a0) // inptr[DCTSIZE*4] - lw t5, 80(a0) // inptr[DCTSIZE*5] - muleq_s.w.phr t0, t0, s0 // ... tmp0 ... - lw t6, 96(a0) // inptr[DCTSIZE*6] - lw t7, 112(a0) // inptr[DCTSIZE*7] + lw s0, 0(a1) /* quantptr[DCTSIZE*0] */ + lw t0, 0(a0) /* inptr[DCTSIZE*0] */ + lw t1, 16(a0) /* inptr[DCTSIZE*1] */ + muleq_s.w.phl v0, t0, s0 /* tmp0 ... */ + lw t2, 32(a0) /* inptr[DCTSIZE*2] */ + lw t3, 48(a0) /* inptr[DCTSIZE*3] */ + lw t4, 64(a0) /* inptr[DCTSIZE*4] */ + lw t5, 80(a0) /* inptr[DCTSIZE*5] */ + muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */ + lw t6, 96(a0) /* inptr[DCTSIZE*6] */ + lw t7, 112(a0) /* inptr[DCTSIZE*7] */ or s4, t1, t2 or s5, t3, t4 bnez s4, 1f - ins t0, v0, 16, 16 // ... tmp0 + ins t0, v0, 16, 16 /* ... tmp0 */ bnez s5, 1f or s6, t5, t6 or s6, s6, t7 bnez s6, 1f - sw t0, 0(a2) // wsptr[DCTSIZE*0] - sw t0, 16(a2) // wsptr[DCTSIZE*1] - sw t0, 32(a2) // wsptr[DCTSIZE*2] - sw t0, 48(a2) // wsptr[DCTSIZE*3] - sw t0, 64(a2) // wsptr[DCTSIZE*4] - sw t0, 80(a2) // wsptr[DCTSIZE*5] - sw t0, 96(a2) // wsptr[DCTSIZE*6] - sw t0, 112(a2) // wsptr[DCTSIZE*7] + sw t0, 0(a2) /* wsptr[DCTSIZE*0] */ + sw t0, 16(a2) /* wsptr[DCTSIZE*1] */ + sw t0, 32(a2) /* wsptr[DCTSIZE*2] */ + sw t0, 48(a2) /* wsptr[DCTSIZE*3] */ + sw t0, 64(a2) /* wsptr[DCTSIZE*4] */ + sw t0, 80(a2) /* wsptr[DCTSIZE*5] */ + sw t0, 96(a2) /* wsptr[DCTSIZE*6] */ + sw t0, 112(a2) /* wsptr[DCTSIZE*7] */ addiu a0, a0, 4 b 2f addiu a1, a1, 4 1: - lw s1, 32(a1) // quantptr[DCTSIZE*2] - lw s2, 64(a1) // quantptr[DCTSIZE*4] - muleq_s.w.phl v0, t2, s1 // tmp1 ... - muleq_s.w.phr t2, t2, s1 // ... tmp1 ... - lw s0, 16(a1) // quantptr[DCTSIZE*1] - lw s1, 48(a1) // quantptr[DCTSIZE*3] - lw s3, 96(a1) // quantptr[DCTSIZE*6] - muleq_s.w.phl v1, t4, s2 // tmp2 ... - muleq_s.w.phr t4, t4, s2 // ... tmp2 ... - lw s2, 80(a1) // quantptr[DCTSIZE*5] - lw t8, 4(AT) // FIX(1.414213562) - ins t2, v0, 16, 16 // ... tmp1 - muleq_s.w.phl v0, t6, s3 // tmp3 ... - muleq_s.w.phr t6, t6, s3 // ... tmp3 ... - ins t4, v1, 16, 16 // ... tmp2 - addq.ph s4, t0, t4 // tmp10 - subq.ph s5, t0, t4 // tmp11 - ins t6, v0, 16, 16 // ... tmp3 - subq.ph s6, t2, t6 // tmp12 ... - addq.ph s7, t2, t6 // tmp13 - mulq_s.ph s6, s6, t8 // ... tmp12 ... - addq.ph t0, s4, s7 // tmp0 - subq.ph t6, s4, s7 // tmp3 - muleq_s.w.phl v0, t1, s0 // tmp4 ... - muleq_s.w.phr t1, t1, s0 // ... tmp4 ... - shll_s.ph s6, s6, 1 // x2 - lw s3, 112(a1) // quantptr[DCTSIZE*7] - subq.ph s6, s6, s7 // ... tmp12 - muleq_s.w.phl v1, t7, s3 // tmp7 ... - muleq_s.w.phr t7, t7, s3 // ... tmp7 ... - ins t1, v0, 16, 16 // ... tmp4 - addq.ph t2, s5, s6 // tmp1 - subq.ph t4, s5, s6 // tmp2 - muleq_s.w.phl v0, t5, s2 // tmp6 ... - muleq_s.w.phr t5, t5, s2 // ... tmp6 ... - ins t7, v1, 16, 16 // ... tmp7 - addq.ph s5, t1, t7 // z11 - subq.ph s6, t1, t7 // z12 - muleq_s.w.phl v1, t3, s1 // tmp5 ... - muleq_s.w.phr t3, t3, s1 // ... tmp5 ... - ins t5, v0, 16, 16 // ... tmp6 - ins t3, v1, 16, 16 // ... tmp5 - addq.ph s7, t5, t3 // z13 - subq.ph v0, t5, t3 // z10 - addq.ph t7, s5, s7 // tmp7 - subq.ph s5, s5, s7 // tmp11 ... - addq.ph v1, v0, s6 // z5 ... - mulq_s.ph s5, s5, t8 // ... tmp11 - lw t8, 8(AT) // FIX(1.847759065) - lw s4, 0(AT) // FIX(1.082392200) + lw s1, 32(a1) /* quantptr[DCTSIZE*2] */ + lw s2, 64(a1) /* quantptr[DCTSIZE*4] */ + muleq_s.w.phl v0, t2, s1 /* tmp1 ... */ + muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */ + lw s0, 16(a1) /* quantptr[DCTSIZE*1] */ + lw s1, 48(a1) /* quantptr[DCTSIZE*3] */ + lw s3, 96(a1) /* quantptr[DCTSIZE*6] */ + muleq_s.w.phl v1, t4, s2 /* tmp2 ... */ + muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */ + lw s2, 80(a1) /* quantptr[DCTSIZE*5] */ + lw t8, 4(AT) /* FIX(1.414213562) */ + ins t2, v0, 16, 16 /* ... tmp1 */ + muleq_s.w.phl v0, t6, s3 /* tmp3 ... */ + muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */ + ins t4, v1, 16, 16 /* ... tmp2 */ + addq.ph s4, t0, t4 /* tmp10 */ + subq.ph s5, t0, t4 /* tmp11 */ + ins t6, v0, 16, 16 /* ... tmp3 */ + subq.ph s6, t2, t6 /* tmp12 ... */ + addq.ph s7, t2, t6 /* tmp13 */ + mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ + addq.ph t0, s4, s7 /* tmp0 */ + subq.ph t6, s4, s7 /* tmp3 */ + muleq_s.w.phl v0, t1, s0 /* tmp4 ... */ + muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */ + shll_s.ph s6, s6, 1 /* x2 */ + lw s3, 112(a1) /* quantptr[DCTSIZE*7] */ + subq.ph s6, s6, s7 /* ... tmp12 */ + muleq_s.w.phl v1, t7, s3 /* tmp7 ... */ + muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */ + ins t1, v0, 16, 16 /* ... tmp4 */ + addq.ph t2, s5, s6 /* tmp1 */ + subq.ph t4, s5, s6 /* tmp2 */ + muleq_s.w.phl v0, t5, s2 /* tmp6 ... */ + muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */ + ins t7, v1, 16, 16 /* ... tmp7 */ + addq.ph s5, t1, t7 /* z11 */ + subq.ph s6, t1, t7 /* z12 */ + muleq_s.w.phl v1, t3, s1 /* tmp5 ... */ + muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */ + ins t5, v0, 16, 16 /* ... tmp6 */ + ins t3, v1, 16, 16 /* ... tmp5 */ + addq.ph s7, t5, t3 /* z13 */ + subq.ph v0, t5, t3 /* z10 */ + addq.ph t7, s5, s7 /* tmp7 */ + subq.ph s5, s5, s7 /* tmp11 ... */ + addq.ph v1, v0, s6 /* z5 ... */ + mulq_s.ph s5, s5, t8 /* ... tmp11 */ + lw t8, 8(AT) /* FIX(1.847759065) */ + lw s4, 0(AT) /* FIX(1.082392200) */ addq.ph s0, t0, t7 subq.ph s1, t0, t7 - mulq_s.ph v1, v1, t8 // ... z5 - shll_s.ph s5, s5, 1 // x2 - lw t8, 12(AT) // FIX(-2.613125930) - sw s0, 0(a2) // wsptr[DCTSIZE*0] - shll_s.ph v0, v0, 1 // x4 - mulq_s.ph v0, v0, t8 // tmp12 ... - mulq_s.ph s4, s6, s4 // tmp10 ... - shll_s.ph v1, v1, 1 // x2 + mulq_s.ph v1, v1, t8 /* ... z5 */ + shll_s.ph s5, s5, 1 /* x2 */ + lw t8, 12(AT) /* FIX(-2.613125930) */ + sw s0, 0(a2) /* wsptr[DCTSIZE*0] */ + shll_s.ph v0, v0, 1 /* x4 */ + mulq_s.ph v0, v0, t8 /* tmp12 ... */ + mulq_s.ph s4, s6, s4 /* tmp10 ... */ + shll_s.ph v1, v1, 1 /* x2 */ addiu a0, a0, 4 addiu a1, a1, 4 - sw s1, 112(a2) // wsptr[DCTSIZE*7] - shll_s.ph s6, v0, 1 // x4 - shll_s.ph s4, s4, 1 // x2 - addq.ph s6, s6, v1 // ... tmp12 - subq.ph t5, s6, t7 // tmp6 - subq.ph s4, s4, v1 // ... tmp10 - subq.ph t3, s5, t5 // tmp5 + sw s1, 112(a2) /* wsptr[DCTSIZE*7] */ + shll_s.ph s6, v0, 1 /* x4 */ + shll_s.ph s4, s4, 1 /* x2 */ + addq.ph s6, s6, v1 /* ... tmp12 */ + subq.ph t5, s6, t7 /* tmp6 */ + subq.ph s4, s4, v1 /* ... tmp10 */ + subq.ph t3, s5, t5 /* tmp5 */ addq.ph s2, t2, t5 - addq.ph t1, s4, t3 // tmp4 + addq.ph t1, s4, t3 /* tmp4 */ subq.ph s3, t2, t5 - sw s2, 16(a2) // wsptr[DCTSIZE*1] - sw s3, 96(a2) // wsptr[DCTSIZE*6] + sw s2, 16(a2) /* wsptr[DCTSIZE*1] */ + sw s3, 96(a2) /* wsptr[DCTSIZE*6] */ addq.ph v0, t4, t3 subq.ph v1, t4, t3 - sw v0, 32(a2) // wsptr[DCTSIZE*2] - sw v1, 80(a2) // wsptr[DCTSIZE*5] + sw v0, 32(a2) /* wsptr[DCTSIZE*2] */ + sw v1, 80(a2) /* wsptr[DCTSIZE*5] */ addq.ph v0, t6, t1 subq.ph v1, t6, t1 - sw v0, 64(a2) // wsptr[DCTSIZE*4] - sw v1, 48(a2) // wsptr[DCTSIZE*3] + sw v0, 64(a2) /* wsptr[DCTSIZE*4] */ + sw v1, 48(a2) /* wsptr[DCTSIZE*3] */ 2: bne a0, t9, 0b @@ -2257,22 +2270,22 @@ LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) */ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 - addiu t9, a0, 128 // end address + addiu t9, a0, 128 /* end address */ lui s8, 0x8080 ori s8, s8, 0x8080 0: - lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) - lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a - lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A - lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c - lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C - lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e - lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E - lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g - lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G - precrq.ph.w t1, s0, t0 // B b - ins t0, s0, 16, 16 // A a + lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */ + lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */ + lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */ + lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */ + lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */ + lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */ + lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */ + lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */ + lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */ + precrq.ph.w t1, s0, t0 /* B b */ + ins t0, s0, 16, 16 /* A a */ bnez t1, 1f or s0, t2, s2 bnez s0, 1f @@ -2280,15 +2293,15 @@ LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) bnez s0, 1f or s0, t6, s6 bnez s0, 1f - shll_s.ph s0, t0, 2 // A a + shll_s.ph s0, t0, 2 /* A a */ lw a3, 0(a1) lw AT, 4(a1) - precrq.ph.w t0, s0, s0 // A A - ins s0, s0, 16, 16 // a a + precrq.ph.w t0, s0, s0 /* A A */ + ins s0, s0, 16, 16 /* a a */ addu a3, a3, a2 addu AT, AT, a2 - precrq.qb.ph t0, t0, t0 // A A A A - precrq.qb.ph s0, s0, s0 // a a a a + precrq.qb.ph t0, t0, t0 /* A A A A */ + precrq.qb.ph s0, s0, s0 /* a a a a */ addu.qb s0, s0, s8 addu.qb t0, t0, s8 sw s0, 0(a3) @@ -2308,85 +2321,85 @@ LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) ins t4, s4, 16, 16 precrq.ph.w t7, s6, t6 ins t6, s6, 16, 16 - lw t8, 4(AT) // FIX(1.414213562) - addq.ph s4, t0, t4 // tmp10 - subq.ph s5, t0, t4 // tmp11 - subq.ph s6, t2, t6 // tmp12 ... - addq.ph s7, t2, t6 // tmp13 - mulq_s.ph s6, s6, t8 // ... tmp12 ... - addq.ph t0, s4, s7 // tmp0 - subq.ph t6, s4, s7 // tmp3 - shll_s.ph s6, s6, 1 // x2 - subq.ph s6, s6, s7 // ... tmp12 - addq.ph t2, s5, s6 // tmp1 - subq.ph t4, s5, s6 // tmp2 - addq.ph s5, t1, t7 // z11 - subq.ph s6, t1, t7 // z12 - addq.ph s7, t5, t3 // z13 - subq.ph v0, t5, t3 // z10 - addq.ph t7, s5, s7 // tmp7 - subq.ph s5, s5, s7 // tmp11 ... - addq.ph v1, v0, s6 // z5 ... - mulq_s.ph s5, s5, t8 // ... tmp11 - lw t8, 8(AT) // FIX(1.847759065) - lw s4, 0(AT) // FIX(1.082392200) - addq.ph s0, t0, t7 // tmp0 + tmp7 - subq.ph s7, t0, t7 // tmp0 - tmp7 - mulq_s.ph v1, v1, t8 // ... z5 + lw t8, 4(AT) /* FIX(1.414213562) */ + addq.ph s4, t0, t4 /* tmp10 */ + subq.ph s5, t0, t4 /* tmp11 */ + subq.ph s6, t2, t6 /* tmp12 ... */ + addq.ph s7, t2, t6 /* tmp13 */ + mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ + addq.ph t0, s4, s7 /* tmp0 */ + subq.ph t6, s4, s7 /* tmp3 */ + shll_s.ph s6, s6, 1 /* x2 */ + subq.ph s6, s6, s7 /* ... tmp12 */ + addq.ph t2, s5, s6 /* tmp1 */ + subq.ph t4, s5, s6 /* tmp2 */ + addq.ph s5, t1, t7 /* z11 */ + subq.ph s6, t1, t7 /* z12 */ + addq.ph s7, t5, t3 /* z13 */ + subq.ph v0, t5, t3 /* z10 */ + addq.ph t7, s5, s7 /* tmp7 */ + subq.ph s5, s5, s7 /* tmp11 ... */ + addq.ph v1, v0, s6 /* z5 ... */ + mulq_s.ph s5, s5, t8 /* ... tmp11 */ + lw t8, 8(AT) /* FIX(1.847759065) */ + lw s4, 0(AT) /* FIX(1.082392200) */ + addq.ph s0, t0, t7 /* tmp0 + tmp7 */ + subq.ph s7, t0, t7 /* tmp0 - tmp7 */ + mulq_s.ph v1, v1, t8 /* ... z5 */ lw a3, 0(a1) - lw t8, 12(AT) // FIX(-2.613125930) - shll_s.ph s5, s5, 1 // x2 + lw t8, 12(AT) /* FIX(-2.613125930) */ + shll_s.ph s5, s5, 1 /* x2 */ addu a3, a3, a2 - shll_s.ph v0, v0, 1 // x4 - mulq_s.ph v0, v0, t8 // tmp12 ... - mulq_s.ph s4, s6, s4 // tmp10 ... - shll_s.ph v1, v1, 1 // x2 + shll_s.ph v0, v0, 1 /* x4 */ + mulq_s.ph v0, v0, t8 /* tmp12 ... */ + mulq_s.ph s4, s6, s4 /* tmp10 ... */ + shll_s.ph v1, v1, 1 /* x2 */ addiu a0, a0, 32 addiu a1, a1, 8 - shll_s.ph s6, v0, 1 // x4 - shll_s.ph s4, s4, 1 // x2 - addq.ph s6, s6, v1 // ... tmp12 + shll_s.ph s6, v0, 1 /* x4 */ + shll_s.ph s4, s4, 1 /* x2 */ + addq.ph s6, s6, v1 /* ... tmp12 */ shll_s.ph s0, s0, 2 - subq.ph t5, s6, t7 // tmp6 - subq.ph s4, s4, v1 // ... tmp10 - subq.ph t3, s5, t5 // tmp5 + subq.ph t5, s6, t7 /* tmp6 */ + subq.ph s4, s4, v1 /* ... tmp10 */ + subq.ph t3, s5, t5 /* tmp5 */ shll_s.ph s7, s7, 2 - addq.ph t1, s4, t3 // tmp4 - addq.ph s1, t2, t5 // tmp1 + tmp6 - subq.ph s6, t2, t5 // tmp1 - tmp6 - addq.ph s2, t4, t3 // tmp2 + tmp5 - subq.ph s5, t4, t3 // tmp2 - tmp5 - addq.ph s4, t6, t1 // tmp3 + tmp4 - subq.ph s3, t6, t1 // tmp3 - tmp4 + addq.ph t1, s4, t3 /* tmp4 */ + addq.ph s1, t2, t5 /* tmp1 + tmp6 */ + subq.ph s6, t2, t5 /* tmp1 - tmp6 */ + addq.ph s2, t4, t3 /* tmp2 + tmp5 */ + subq.ph s5, t4, t3 /* tmp2 - tmp5 */ + addq.ph s4, t6, t1 /* tmp3 + tmp4 */ + subq.ph s3, t6, t1 /* tmp3 - tmp4 */ shll_s.ph s1, s1, 2 shll_s.ph s2, s2, 2 shll_s.ph s3, s3, 2 shll_s.ph s4, s4, 2 shll_s.ph s5, s5, 2 shll_s.ph s6, s6, 2 - precrq.ph.w t0, s1, s0 // B A - ins s0, s1, 16, 16 // b a - precrq.ph.w t2, s3, s2 // D C - ins s2, s3, 16, 16 // d c - precrq.ph.w t4, s5, s4 // F E - ins s4, s5, 16, 16 // f e - precrq.ph.w t6, s7, s6 // H G - ins s6, s7, 16, 16 // h g - precrq.qb.ph t0, t2, t0 // D C B A - precrq.qb.ph s0, s2, s0 // d c b a - precrq.qb.ph t4, t6, t4 // H G F E - precrq.qb.ph s4, s6, s4 // h g f e + precrq.ph.w t0, s1, s0 /* B A */ + ins s0, s1, 16, 16 /* b a */ + precrq.ph.w t2, s3, s2 /* D C */ + ins s2, s3, 16, 16 /* d c */ + precrq.ph.w t4, s5, s4 /* F E */ + ins s4, s5, 16, 16 /* f e */ + precrq.ph.w t6, s7, s6 /* H G */ + ins s6, s7, 16, 16 /* h g */ + precrq.qb.ph t0, t2, t0 /* D C B A */ + precrq.qb.ph s0, s2, s0 /* d c b a */ + precrq.qb.ph t4, t6, t4 /* H G F E */ + precrq.qb.ph s4, s6, s4 /* h g f e */ addu.qb s0, s0, s8 addu.qb s4, s4, s8 - sw s0, 0(a3) // outptr[0/1/2/3] d c b a - sw s4, 4(a3) // outptr[4/5/6/7] h g f e + sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */ + sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */ lw a3, -4(a1) addu.qb t0, t0, s8 addu a3, a3, a2 addu.qb t4, t4, s8 - sw t0, 0(a3) // outptr[0/1/2/3] D C B A + sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */ bne a0, t9, 0b - sw t4, 4(a3) // outptr[4/5/6/7] H G F E + sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */ 2: @@ -2428,51 +2441,51 @@ LEAF_DSPR2(jsimd_fdct_islow_dspr2) li s8, 8 move a1, a0 1: - lw s0, 0(a1) // tmp0 = 1|0 - lw s1, 4(a1) // tmp1 = 3|2 - lw s2, 8(a1) // tmp2 = 5|4 - lw s3, 12(a1) // tmp3 = 7|6 - packrl.ph s1, s1, s1 // tmp1 = 2|3 - packrl.ph s3, s3, s3 // tmp3 = 6|7 - subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 - subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 - dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 - mult $ac1, $0, $0 // ac1 = 0 - dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 - dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 - mult $ac2, $0, $0 // ac2 = 0 - dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 - dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 - mult $ac3, $0, $0 // ac3 = 0 - dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 - dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 - addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 - addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 - extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 - extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 - extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 - extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 - addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 - subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 + lw s0, 0(a1) /* tmp0 = 1|0 */ + lw s1, 4(a1) /* tmp1 = 3|2 */ + lw s2, 8(a1) /* tmp2 = 5|4 */ + lw s3, 12(a1) /* tmp3 = 7|6 */ + packrl.ph s1, s1, s1 /* tmp1 = 2|3 */ + packrl.ph s3, s3, s3 /* tmp3 = 6|7 */ + subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */ + subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */ + dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */ + dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */ + mult $ac2, $0, $0 /* ac2 = 0 */ + dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */ + dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */ + mult $ac3, $0, $0 /* ac3 = 0 */ + dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */ + dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */ + addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */ + addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */ + extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ + extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ + extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */ + extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */ + addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ + subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ sh s0, 2(a1) sh s1, 6(a1) sh s2, 10(a1) sh s3, 14(a1) - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 - mult $ac1, $0, $0 // ac1 = 0 - dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 - sra s4, s5, 16 // tmp4 = t11 + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */ + sra s4, s5, 16 /* tmp4 = t11 */ addiu a1, a1, 16 addiu s8, s8, -1 - extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 - extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 - addu s2, s5, s4 // tmp2 = t10 + t11 - subu s3, s5, s4 // tmp3 = t10 - t11 - sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 - sll s3, s3, 2 // tmp3 = (t10 - t11) << 2 + extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ + extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ + addu s2, s5, s4 /* tmp2 = t10 + t11 */ + subu s3, s5, s4 /* tmp3 = t10 - t11 */ + sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */ + sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */ sh s2, -16(a1) sh s3, -8(a1) sh s0, -12(a1) @@ -2492,62 +2505,62 @@ LEAF_DSPR2(jsimd_fdct_islow_dspr2) li s8, 8 2: - lh a2, 0(a0) // 0 - lh a3, 16(a0) // 8 - lh v0, 32(a0) // 16 - lh v1, 48(a0) // 24 - lh s4, 64(a0) // 32 - lh s5, 80(a0) // 40 - lh s6, 96(a0) // 48 - lh s7, 112(a0) // 56 - addu s2, v0, s5 // tmp2 = 16 + 40 - subu s5, v0, s5 // tmp5 = 16 - 40 - addu s3, v1, s4 // tmp3 = 24 + 32 - subu s4, v1, s4 // tmp4 = 24 - 32 - addu s0, a2, s7 // tmp0 = 0 + 56 - subu s7, a2, s7 // tmp7 = 0 - 56 - addu s1, a3, s6 // tmp1 = 8 + 48 - subu s6, a3, s6 // tmp6 = 8 - 48 - addu a2, s0, s3 // tmp10 = tmp0 + tmp3 - subu v1, s0, s3 // tmp13 = tmp0 - tmp3 - addu a3, s1, s2 // tmp11 = tmp1 + tmp2 - subu v0, s1, s2 // tmp12 = tmp1 - tmp2 - mult s7, t1 // ac0 = tmp7 * c1 - madd s4, t0 // ac0 += tmp4 * c0 - madd s5, t4 // ac0 += tmp5 * c4 - madd s6, t2 // ac0 += tmp6 * c2 - mult $ac1, s7, t2 // ac1 = tmp7 * c2 - msub $ac1, s4, t3 // ac1 -= tmp4 * c3 - msub $ac1, s5, t6 // ac1 -= tmp5 * c6 - msub $ac1, s6, t7 // ac1 -= tmp6 * c7 - mult $ac2, s7, t4 // ac2 = tmp7 * c4 - madd $ac2, s4, t2 // ac2 += tmp4 * c2 - madd $ac2, s5, t5 // ac2 += tmp5 * c5 - msub $ac2, s6, t6 // ac2 -= tmp6 * c6 - mult $ac3, s7, t0 // ac3 = tmp7 * c0 - msub $ac3, s4, t1 // ac3 -= tmp4 * c1 - madd $ac3, s5, t2 // ac3 += tmp5 * c2 - msub $ac3, s6, t3 // ac3 -= tmp6 * c3 - extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 - extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 - extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 - extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 + lh a2, 0(a0) /* 0 */ + lh a3, 16(a0) /* 8 */ + lh v0, 32(a0) /* 16 */ + lh v1, 48(a0) /* 24 */ + lh s4, 64(a0) /* 32 */ + lh s5, 80(a0) /* 40 */ + lh s6, 96(a0) /* 48 */ + lh s7, 112(a0) /* 56 */ + addu s2, v0, s5 /* tmp2 = 16 + 40 */ + subu s5, v0, s5 /* tmp5 = 16 - 40 */ + addu s3, v1, s4 /* tmp3 = 24 + 32 */ + subu s4, v1, s4 /* tmp4 = 24 - 32 */ + addu s0, a2, s7 /* tmp0 = 0 + 56 */ + subu s7, a2, s7 /* tmp7 = 0 - 56 */ + addu s1, a3, s6 /* tmp1 = 8 + 48 */ + subu s6, a3, s6 /* tmp6 = 8 - 48 */ + addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */ + subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */ + addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */ + subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */ + mult s7, t1 /* ac0 = tmp7 * c1 */ + madd s4, t0 /* ac0 += tmp4 * c0 */ + madd s5, t4 /* ac0 += tmp5 * c4 */ + madd s6, t2 /* ac0 += tmp6 * c2 */ + mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */ + msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */ + msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */ + msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */ + mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */ + madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */ + madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */ + msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */ + mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */ + msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */ + madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */ + msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */ + extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */ + extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */ + extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */ + extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */ addiu s8, s8, -1 - addu s4, a2, a3 // tmp4 = tmp10 + tmp11 - subu s5, a2, a3 // tmp5 = tmp10 - tmp11 + addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */ + subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */ sh s0, 16(a0) sh s1, 48(a0) sh s2, 80(a0) sh s3, 112(a0) - mult v0, t8 // ac0 = tmp12 * c8 - madd v1, t9 // ac0 += tmp13 * c9 - mult $ac1, v1, t8 // ac1 = tmp13 * c8 - msub $ac1, v0, a1 // ac1 -= tmp12 * c10 + mult v0, t8 /* ac0 = tmp12 * c8 */ + madd v1, t9 /* ac0 += tmp13 * c9 */ + mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */ + msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */ addiu a0, a0, 2 - extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15 - extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15 - shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2 - shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2 + extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */ + extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */ + shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */ + shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */ sh s4, -2(a0) sh s5, 62(a0) sh s6, 30(a0) @@ -2571,55 +2584,59 @@ LEAF_DSPR2(jsimd_fdct_ifast_dspr2) SAVE_REGS_ON_STACK 8, s0, s1 - li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff) - li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff) - li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff) - li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff) + li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) | + (334 & 0xffff) */ + li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) | + (139 & 0xffff) */ + li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) | + (98 & 0xffff) */ + li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) | + (181 & 0xffff) */ move v0, a0 - addiu v1, v0, 128 // end address + addiu v1, v0, 128 /* end address */ 0: - lw t0, 0(v0) // tmp0 = 1|0 - lw t1, 4(v0) // tmp1 = 3|2 - lw t2, 8(v0) // tmp2 = 5|4 - lw t3, 12(v0) // tmp3 = 7|6 - packrl.ph t1, t1, t1 // tmp1 = 2|3 - packrl.ph t3, t3, t3 // tmp3 = 6|7 - subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4 - subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7 - addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3 - addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0 - addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10 - subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13 - sra t4, t8, 16 // tmp4 = t11 - mult $0, $0 // ac0 = 0 + lw t0, 0(v0) /* tmp0 = 1|0 */ + lw t1, 4(v0) /* tmp1 = 3|2 */ + lw t2, 8(v0) /* tmp2 = 5|4 */ + lw t3, 12(v0) /* tmp3 = 7|6 */ + packrl.ph t1, t1, t1 /* tmp1 = 2|3 */ + packrl.ph t3, t3, t3 /* tmp3 = 6|7 */ + subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */ + subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */ + addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */ + addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */ + addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ + subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ + sra t4, t8, 16 /* tmp4 = t11 */ + mult $0, $0 /* ac0 = 0 */ dpa.w.ph $ac0, t9, s1 - mult $ac1, $0, $0 // ac1 = 0 - dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98 - dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98 - mult $ac2, $0, $0 // ac2 = 0 - dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139 - mult $ac3, $0, $0 // ac3 = 0 - dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334 - precrq.ph.w t0, t5, t7 // t0 = t5|t6 - addq.ph t2, t8, t4 // tmp2 = t10 + t11 - subq.ph t3, t8, t4 // tmp3 = t10 - t11 + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */ + dpsx.w.ph $ac1, t5, a3 /* ac1 += t6*98 + t7*98 */ + mult $ac2, $0, $0 /* ac2 = 0 */ + dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */ + mult $ac3, $0, $0 /* ac3 = 0 */ + dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */ + precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */ + addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */ + subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */ extr.w t4, $ac0, 8 - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181 - extr.w t0, $ac1, 8 // t0 = z5 - extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139) - extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334) - extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181) - add t6, t1, t0 // t6 = z2 - add t7, t7, t0 // t7 = z4 - subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3 - addq.ph t8, t5, t8 // t9 = z11 = tmp7 + z3 - addq.ph t1, t0, t6 // t1 = z13 + z2 - subq.ph t6, t0, t6 // t6 = z13 - z2 - addq.ph t0, t8, t7 // t0 = z11 + z4 - subq.ph t7, t8, t7 // t7 = z11 - z4 + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */ + extr.w t0, $ac1, 8 /* t0 = z5 */ + extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */ + extr.w t7, $ac3, 8 /* t2 = MULTIPLY(tmp12, 334) */ + extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */ + add t6, t1, t0 /* t6 = z2 */ + add t7, t7, t0 /* t7 = z4 */ + subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */ + addq.ph t8, t5, t8 /* t9 = z11 = tmp7 + z3 */ + addq.ph t1, t0, t6 /* t1 = z13 + z2 */ + subq.ph t6, t0, t6 /* t6 = z13 - z2 */ + addq.ph t0, t8, t7 /* t0 = z11 + z4 */ + subq.ph t7, t8, t7 /* t7 = z11 - z4 */ addq.ph t5, t4, t9 subq.ph t4, t9, t4 sh t2, 0(v0) @@ -2637,64 +2654,69 @@ LEAF_DSPR2(jsimd_fdct_ifast_dspr2) addiu v1, v0, 16 1: - lh t0, 0(v0) // 0 - lh t1, 16(v0) // 8 - lh t2, 32(v0) // 16 - lh t3, 48(v0) // 24 - lh t4, 64(v0) // 32 - lh t5, 80(v0) // 40 - lh t6, 96(v0) // 48 - lh t7, 112(v0) // 56 - add t8, t0, t7 // t8 = tmp0 - sub t7, t0, t7 // t7 = tmp7 - add t0, t1, t6 // t0 = tmp1 - sub t1, t1, t6 // t1 = tmp6 - add t6, t2, t5 // t6 = tmp2 - sub t5, t2, t5 // t5 = tmp5 - add t2, t3, t4 // t2 = tmp3 - sub t3, t3, t4 // t3 = tmp4 - add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3 - sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3 - sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2 - ins t8, s0, 16, 16 // t8 = tmp12|tmp13 - add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2 - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181 - add s0, t4, t2 // t8 = tmp10+tmp11 - sub t4, t4, t2 // t4 = tmp10-tmp11 + lh t0, 0(v0) /* 0 */ + lh t1, 16(v0) /* 8 */ + lh t2, 32(v0) /* 16 */ + lh t3, 48(v0) /* 24 */ + lh t4, 64(v0) /* 32 */ + lh t5, 80(v0) /* 40 */ + lh t6, 96(v0) /* 48 */ + lh t7, 112(v0) /* 56 */ + add t8, t0, t7 /* t8 = tmp0 */ + sub t7, t0, t7 /* t7 = tmp7 */ + add t0, t1, t6 /* t0 = tmp1 */ + sub t1, t1, t6 /* t1 = tmp6 */ + add t6, t2, t5 /* t6 = tmp2 */ + sub t5, t2, t5 /* t5 = tmp5 */ + add t2, t3, t4 /* t2 = tmp3 */ + sub t3, t3, t4 /* t3 = tmp4 */ + add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */ + sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */ + sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */ + ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */ + add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */ + add s0, t4, t2 /* t8 = tmp10+tmp11 */ + sub t4, t4, t2 /* t4 = tmp10-tmp11 */ sh s0, 0(v0) sh t4, 64(v0) - extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781) - addq.ph t4, t8, t2 // t9 = tmp13 + z1 - subq.ph t8, t8, t2 // t2 = tmp13 - z1 + extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13, + FIX_0_707106781) */ + addq.ph t4, t8, t2 /* t9 = tmp13 + z1 */ + subq.ph t8, t8, t2 /* t2 = tmp13 - z1 */ sh t4, 32(v0) sh t8, 96(v0) - add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5 - add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6 - add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7 + add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */ + add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */ + add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */ andi t4, a1, 0xffff mul s0, t1, t4 - sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965) - ins t1, t3, 16, 16 // t1 = tmp10|tmp12 - mult $0, $0 // ac0 = 0 - mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98 - extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433) - add t2, t7, t8 // t2 = tmp7 + z5 - sub t7, t7, t8 // t7 = tmp7 - z5 + sra s0, s0, 8 /* s0 = z4 = + MULTIPLY(tmp12, FIX_1_306562965) */ + ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */ + mult $0, $0 /* ac0 = 0 */ + mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */ + extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12, + FIX_0_382683433) */ + add t2, t7, t8 /* t2 = tmp7 + z5 */ + sub t7, t7, t8 /* t7 = tmp7 - z5 */ andi t4, a2, 0xffff mul t8, t3, t4 - sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100) + sra t8, t8, 8 /* t8 = z2 = + MULTIPLY(tmp10, FIX_0_541196100) */ andi t4, s1, 0xffff mul t6, t0, t4 - sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781) - add t0, t6, t8 // t0 = z3 + z2 - sub t1, t6, t8 // t1 = z3 - z2 - add t3, t6, s0 // t3 = z3 + z4 - sub t4, t6, s0 // t4 = z3 - z4 - sub t5, t2, t1 // t5 = dataptr[5] - sub t6, t7, t0 // t6 = dataptr[3] - add t3, t2, t3 // t3 = dataptr[1] - add t4, t7, t4 // t4 = dataptr[7] + sra t6, t6, 8 /* t6 = z3 = + MULTIPLY(tmp11, FIX_0_707106781) */ + add t0, t6, t8 /* t0 = z3 + z2 */ + sub t1, t6, t8 /* t1 = z3 - z2 */ + add t3, t6, s0 /* t3 = z3 + z4 */ + sub t4, t6, s0 /* t4 = z3 - z4 */ + sub t5, t2, t1 /* t5 = dataptr[5] */ + sub t6, t7, t0 /* t6 = dataptr[3] */ + add t3, t2, t3 /* t3 = dataptr[1] */ + add t4, t7, t4 /* t4 = dataptr[7] */ sh t5, 80(v0) sh t6, 48(v0) sh t3, 16(v0) @@ -2721,7 +2743,7 @@ LEAF_DSPR2(jsimd_quantize_dspr2) SAVE_REGS_ON_STACK 16, s0, s1, s2 - addiu v0, a2, 124 // v0 = workspace_end + addiu v0, a2, 124 /* v0 = workspace_end */ lh t0, 0(a2) lh t1, 0(a1) lh t2, 128(a1) @@ -2821,7 +2843,7 @@ LEAF_DSPR2(jsimd_quantize_float_dspr2) */ .set at - li t1, 0x46800100 // integer representation 16384.5 + li t1, 0x46800100 /* integer representation 16384.5 */ mtc1 t1, f0 li t0, 63 0: @@ -2913,30 +2935,30 @@ LEAF_DSPR2(jsimd_idct_2x2_dspr2) addiu s3, zero, -10426 addiu s4, zero, 6967 addiu s5, zero, -5906 - lh t0, 0(a1) // t0 = inptr[DCTSIZE*0] - lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0] - lh t1, 48(a1) // t1 = inptr[DCTSIZE*3] - lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3] + lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */ + lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */ + lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */ + lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */ mul t4, t5, t0 - lh t0, 16(a1) // t0 = inptr[DCTSIZE*1] - lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1] + lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */ + lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */ mul t6, t6, t1 mul t5, t5, t0 - lh t2, 80(a1) // t2 = inptr[DCTSIZE*5] - lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5] - lh t3, 112(a1) // t3 = inptr[DCTSIZE*7] - lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7] + lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */ + lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */ + lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */ + lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */ mul t7, t7, t2 mult zero, zero mul t8, t8, t3 - li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff) - li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff) - ins t6, t5, 16, 16 // t6 = t5|t6 + li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */ + li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */ + ins t6, t5, 16, 16 /* t6 = t5|t6 */ sll t4, t4, 15 dpa.w.ph $ac0, t6, s0 lh t1, 2(a1) lh t6, 2(a0) - ins t8, t7, 16, 16 // t8 = t7|t8 + ins t8, t7, 16, 16 /* t8 = t7|t8 */ dpa.w.ph $ac0, t8, s1 mflo t0, $ac0 mul t5, t6, t1 @@ -3122,7 +3144,7 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) * a1 = coef_block * a2 = output_buf * a3 = output_col - * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes + * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes) */ .set at @@ -3138,35 +3160,44 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) li s3, 0x52031ccd 0: - lh s6, 32(t0) // inptr[DCTSIZE*2] - lh t6, 32(a0) // quantptr[DCTSIZE*2] - lh s7, 96(t0) // inptr[DCTSIZE*6] - lh t7, 96(a0) // quantptr[DCTSIZE*6] - mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh s4, 0(t0) // inptr[DCTSIZE*0] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s5, 0(a0) // quantptr[0] + lh s6, 32(t0) /* inptr[DCTSIZE*2] */ + lh t6, 32(a0) /* quantptr[DCTSIZE*2] */ + lh s7, 96(t0) /* inptr[DCTSIZE*6] */ + lh t7, 96(a0) /* quantptr[DCTSIZE*6] */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh s4, 0(t0) /* inptr[DCTSIZE*0] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s5, 0(a0) /* quantptr[0] */ li s6, 15137 li s7, 6270 - mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) - mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh t5, 112(t0) // inptr[DCTSIZE*7] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s4, 112(a0) // quantptr[DCTSIZE*7] - lh v0, 80(t0) // inptr[DCTSIZE*5] - lh s5, 80(a0) // quantptr[DCTSIZE*5] - lh s6, 48(a0) // quantptr[DCTSIZE*3] - sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) - lh s7, 16(a0) // quantptr[DCTSIZE*1] - lh t8, 16(t0) // inptr[DCTSIZE*1] - subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) - lh t7, 48(t0) // inptr[DCTSIZE*3] - mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) - mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) - mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) - mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) - addu t3, t2, t6 // tmp10 = tmp0 + z2 - subu t4, t2, t6 // tmp10 = tmp0 - z2 + mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh t5, 112(t0) /* inptr[DCTSIZE*7] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s4, 112(a0) /* quantptr[DCTSIZE*7] */ + lh v0, 80(t0) /* inptr[DCTSIZE*5] */ + lh s5, 80(a0) /* quantptr[DCTSIZE*5] */ + lh s6, 48(a0) /* quantptr[DCTSIZE*3] */ + sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ + lh s7, 16(a0) /* quantptr[DCTSIZE*1] */ + lh t8, 16(t0) /* inptr[DCTSIZE*1] */ + subu t6, t6, t7 /* tmp2 = + MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ + lh t7, 48(t0) /* inptr[DCTSIZE*3] */ + mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * + quantptr[DCTSIZE*7]) */ + mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] * + quantptr[DCTSIZE*5]) */ + mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * + quantptr[DCTSIZE*3]) */ + mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * + quantptr[DCTSIZE*1]) */ + addu t3, t2, t6 /* tmp10 = tmp0 + z2 */ + subu t4, t2, t6 /* tmp10 = tmp0 - z2 */ mult $ac0, zero, zero mult $ac1, zero, zero ins t5, v0, 16, 16 @@ -3185,47 +3216,56 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) subu t5, t4, s4 addu s6, t3, s5 subu s7, t3, s5 - shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) - shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) - shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) - shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) + shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */ + shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */ + shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ + shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ sw t6, 28(t1) sw t5, 60(t1) sw s6, -4(t1) bgtz t9, 0b sw s7, 92(t1) - // second loop three pass + /* second loop three pass */ li t9, 3 1: - lh s6, 34(t0) // inptr[DCTSIZE*2] - lh t6, 34(a0) // quantptr[DCTSIZE*2] - lh s7, 98(t0) // inptr[DCTSIZE*6] - lh t7, 98(a0) // quantptr[DCTSIZE*6] - mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh s4, 2(t0) // inptr[DCTSIZE*0] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s5, 2(a0) // quantptr[DCTSIZE*0] + lh s6, 34(t0) /* inptr[DCTSIZE*2] */ + lh t6, 34(a0) /* quantptr[DCTSIZE*2] */ + lh s7, 98(t0) /* inptr[DCTSIZE*6] */ + lh t7, 98(a0) /* quantptr[DCTSIZE*6] */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh s4, 2(t0) /* inptr[DCTSIZE*0] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s5, 2(a0) /* quantptr[DCTSIZE*0] */ li s6, 15137 li s7, 6270 - mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) - mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh t5, 114(t0) // inptr[DCTSIZE*7] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s4, 114(a0) // quantptr[DCTSIZE*7] - lh s5, 82(a0) // quantptr[DCTSIZE*5] - lh t6, 82(t0) // inptr[DCTSIZE*5] - sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) - lh s6, 50(a0) // quantptr[DCTSIZE*3] - lh t8, 18(t0) // inptr[DCTSIZE*1] - subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) - lh t7, 50(t0) // inptr[DCTSIZE*3] - lh s7, 18(a0) // quantptr[DCTSIZE*1] - mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) - mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) - mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) - mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) - addu t3, t2, v0 // tmp10 = tmp0 + z2 - subu t4, t2, v0 // tmp10 = tmp0 - z2 + mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ + mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh t5, 114(t0) /* inptr[DCTSIZE*7] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s4, 114(a0) /* quantptr[DCTSIZE*7] */ + lh s5, 82(a0) /* quantptr[DCTSIZE*5] */ + lh t6, 82(t0) /* inptr[DCTSIZE*5] */ + sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ + lh s6, 50(a0) /* quantptr[DCTSIZE*3] */ + lh t8, 18(t0) /* inptr[DCTSIZE*1] */ + subu v0, v0, t7 /* tmp2 = + MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ + lh t7, 50(t0) /* inptr[DCTSIZE*3] */ + lh s7, 18(a0) /* quantptr[DCTSIZE*1] */ + mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * + quantptr[DCTSIZE*7]) */ + mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] * + quantptr[DCTSIZE*5]) */ + mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * + quantptr[DCTSIZE*3]) */ + mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * + quantptr[DCTSIZE*1]) */ + addu t3, t2, v0 /* tmp10 = tmp0 + z2 */ + subu t4, t2, v0 /* tmp10 = tmp0 - z2 */ mult $ac0, zero, zero mult $ac1, zero, zero ins t5, t6, 16, 16 @@ -3244,10 +3284,10 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) subu s4, t4, t5 addu s6, t3, t6 subu s7, t3, t6 - shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) - shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) - shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) - shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) + shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */ + shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */ + shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ + shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ sw s5, 32(t1) sw s4, 64(t1) sw s6, 0(t1) @@ -3255,16 +3295,18 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sw s7, 96(t1) move t1, v1 li s4, 15137 - lw s6, 8(t1) // wsptr[2] + lw s6, 8(t1) /* wsptr[2] */ li s5, 6270 - lw s7, 24(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 0(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 28(t1) // wsptr[7] - lh t6, 20(t1) // wsptr[5] - lh t7, 12(t1) // wsptr[3] - lh t8, 4(t1) // wsptr[1] + lw s7, 24(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 0(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 28(t1) /* wsptr[7] */ + lh t6, 20(t1) /* wsptr[5] */ + lh t7, 12(t1) /* wsptr[3] */ + lh t8, 4(t1) /* wsptr[1] */ ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero @@ -3273,23 +3315,25 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ sll s4, t9, 2 - lw v0, 0(a2) // output_buf[ctr] + lw v0, 0(a2) /* output_buf[ctr] */ shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 @@ -3298,7 +3342,7 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 @@ -3307,18 +3351,20 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sb t7, 1(v0) sb t8, 2(v0) sb t6, 3(v0) - // 2 + /* 2 */ li s4, 15137 - lw s6, 40(t1) // wsptr[2] + lw s6, 40(t1) /* wsptr[2] */ li s5, 6270 - lw s7, 56(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 32(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 60(t1) // wsptr[7] - lh t6, 52(t1) // wsptr[5] - lh t7, 44(t1) // wsptr[3] - lh t8, 36(t1) // wsptr[1] + lw s7, 56(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 32(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 60(t1) /* wsptr[7] */ + lh t6, 52(t1) /* wsptr[5] */ + lh t7, 44(t1) /* wsptr[3] */ + lh t8, 36(t1) /* wsptr[1] */ ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero @@ -3327,23 +3373,29 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, + CONST_BITS-PASS1_BITS+1) */ sll s4, t9, 2 - lw v0, 4(a2) // output_buf[ctr] + lw v0, 4(a2) /* output_buf[ctr] */ shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 @@ -3352,7 +3404,7 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 @@ -3361,18 +3413,20 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sb t7, 1(v0) sb t8, 2(v0) sb t6, 3(v0) - // 3 + /* 3 */ li s4, 15137 - lw s6, 72(t1) // wsptr[2] + lw s6, 72(t1) /* wsptr[2] */ li s5, 6270 - lw s7, 88(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 64(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 92(t1) // wsptr[7] - lh t6, 84(t1) // wsptr[5] - lh t7, 76(t1) // wsptr[3] - lh t8, 68(t1) // wsptr[1] + lw s7, 88(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 64(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 92(t1) /* wsptr[7] */ + lh t6, 84(t1) /* wsptr[5] */ + lh t7, 76(t1) /* wsptr[3] */ + lh t8, 68(t1) /* wsptr[1] */ ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero @@ -3381,23 +3435,25 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ sll s4, t9, 2 - lw v0, 8(a2) // output_buf[ctr] + lw v0, 8(a2) /* output_buf[ctr] */ shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 @@ -3406,7 +3462,7 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 @@ -3416,16 +3472,18 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sb t8, 2(v0) sb t6, 3(v0) li s4, 15137 - lw s6, 104(t1) // wsptr[2] + lw s6, 104(t1) /* wsptr[2] */ li s5, 6270 - lw s7, 120(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 96(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 124(t1) // wsptr[7] - lh t6, 116(t1) // wsptr[5] - lh t7, 108(t1) // wsptr[3] - lh t8, 100(t1) // wsptr[1] + lw s7, 120(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 96(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 124(t1) /* wsptr[7] */ + lh t6, 116(t1) /* wsptr[5] */ + lh t7, 108(t1) /* wsptr[3] */ + lh t8, 100(t1) /* wsptr[1] */ ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero @@ -3434,23 +3492,25 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2; + addu t3, t2, s4 /* tmp10 = tmp0 + z2; */ mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2; + subu t4, t2, s4 /* tmp10 = tmp0 - z2; */ addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ sll s4, t9, 2 - lw v0, 12(a2) // output_buf[ctr] + lw v0, 12(a2) /* output_buf[ctr] */ shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 @@ -3459,7 +3519,7 @@ LEAF_DSPR2(jsimd_idct_4x4_dspr2) sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 @@ -3496,54 +3556,54 @@ LEAF_DSPR2(jsimd_idct_6x6_dspr2) addiu s1, zero, 2998 1: - lh s2, 0(a0) // q0 = quantptr[ 0] - lh s3, 32(a0) // q1 = quantptr[16] - lh s4, 64(a0) // q2 = quantptr[32] - lh t2, 64(a1) // tmp2 = inptr[32] - lh t1, 32(a1) // tmp1 = inptr[16] - lh t0, 0(a1) // tmp0 = inptr[ 0] - mul t2, t2, s4 // tmp2 = tmp2 * q2 - mul t1, t1, s3 // tmp1 = tmp1 * q1 - mul t0, t0, s2 // tmp0 = tmp0 * q0 - lh t6, 16(a1) // z1 = inptr[ 8] - lh t8, 80(a1) // z3 = inptr[40] - lh t7, 48(a1) // z2 = inptr[24] - lh s2, 16(a0) // q0 = quantptr[ 8] - lh s4, 80(a0) // q2 = quantptr[40] - lh s3, 48(a0) // q1 = quantptr[24] - mul t2, t2, t9 // tmp2 = tmp2 * 5793 - mul t1, t1, s0 // tmp1 = tmp1 * 10033 - sll t0, t0, 13 // tmp0 = tmp0 << 13 - mul t6, t6, s2 // z1 = z1 * q0 - mul t8, t8, s4 // z3 = z3 * q2 - mul t7, t7, s3 // z2 = z2 * q1 - addu t3, t0, t2 // tmp10 = tmp0 + tmp2 - sll t2, t2, 1 // tmp2 = tmp2 << 2 - subu t4, t0, t2 // tmp11 = tmp0 - tmp2; - subu t5, t3, t1 // tmp12 = tmp10 - tmp1 - addu t3, t3, t1 // tmp10 = tmp10 + tmp1 - addu t1, t6, t8 // tmp1 = z1 + z3 - mul t1, t1, s1 // tmp1 = tmp1 * 2998 - shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 - subu t2, t6, t8 // tmp2 = z1 - z3 - subu t2, t2, t7 // tmp2 = tmp2 - z2 - sll t2, t2, 2 // tmp2 = tmp2 << 2 - addu t0, t6, t7 // tmp0 = z1 + z2 - sll t0, t0, 13 // tmp0 = tmp0 << 13 - subu s2, t8, t7 // q0 = z3 - z2 - sll s2, s2, 13 // q0 = q0 << 13 - addu t0, t0, t1 // tmp0 = tmp0 + tmp1 - addu t1, s2, t1 // tmp1 = q0 + tmp1 - addu s2, t4, t2 // q0 = tmp11 + tmp2 - subu s3, t4, t2 // q1 = tmp11 - tmp2 - addu t6, t3, t0 // z1 = tmp10 + tmp0 - subu t7, t3, t0 // z2 = tmp10 - tmp0 - addu t4, t5, t1 // tmp11 = tmp12 + tmp1 - subu t5, t5, t1 // tmp12 = tmp12 - tmp1 - shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 - shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 - shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 - shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 + lh s2, 0(a0) /* q0 = quantptr[ 0] */ + lh s3, 32(a0) /* q1 = quantptr[16] */ + lh s4, 64(a0) /* q2 = quantptr[32] */ + lh t2, 64(a1) /* tmp2 = inptr[32] */ + lh t1, 32(a1) /* tmp1 = inptr[16] */ + lh t0, 0(a1) /* tmp0 = inptr[ 0] */ + mul t2, t2, s4 /* tmp2 = tmp2 * q2 */ + mul t1, t1, s3 /* tmp1 = tmp1 * q1 */ + mul t0, t0, s2 /* tmp0 = tmp0 * q0 */ + lh t6, 16(a1) /* z1 = inptr[ 8] */ + lh t8, 80(a1) /* z3 = inptr[40] */ + lh t7, 48(a1) /* z2 = inptr[24] */ + lh s2, 16(a0) /* q0 = quantptr[ 8] */ + lh s4, 80(a0) /* q2 = quantptr[40] */ + lh s3, 48(a0) /* q1 = quantptr[24] */ + mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */ + mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */ + sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ + mul t6, t6, s2 /* z1 = z1 * q0 */ + mul t8, t8, s4 /* z3 = z3 * q2 */ + mul t7, t7, s3 /* z2 = z2 * q1 */ + addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */ + sll t2, t2, 1 /* tmp2 = tmp2 << 2 */ + subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */ + subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */ + addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */ + addu t1, t6, t8 /* tmp1 = z1 + z3 */ + mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */ + shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ + subu t2, t6, t8 /* tmp2 = z1 - z3 */ + subu t2, t2, t7 /* tmp2 = tmp2 - z2 */ + sll t2, t2, 2 /* tmp2 = tmp2 << 2 */ + addu t0, t6, t7 /* tmp0 = z1 + z2 */ + sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ + subu s2, t8, t7 /* q0 = z3 - z2 */ + sll s2, s2, 13 /* q0 = q0 << 13 */ + addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */ + addu t1, s2, t1 /* tmp1 = q0 + tmp1 */ + addu s2, t4, t2 /* q0 = tmp11 + tmp2 */ + subu s3, t4, t2 /* q1 = tmp11 - tmp2 */ + addu t6, t3, t0 /* z1 = tmp10 + tmp0 */ + subu t7, t3, t0 /* z2 = tmp10 - tmp0 */ + addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */ + subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */ + shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */ + shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */ + shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ + shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */ sw s2, 24(v0) sw s3, 96(v0) sw t6, 0(v0) @@ -3644,7 +3704,7 @@ LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) li a3, 8 1: - // odd part + /* odd part */ lh t0, 48(a1) lh t1, 48(a0) lh t2, 16(a1) @@ -3653,55 +3713,55 @@ LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) lh t5, 80(a0) lh t6, 112(a1) lh t7, 112(a0) - mul t0, t0, t1 // z2 - mul t1, t2, t3 // z1 - mul t2, t4, t5 // z3 - mul t3, t6, t7 // z4 - li t4, 10703 // FIX(1.306562965) - li t5, 4433 // FIX_0_541196100 - li t6, 7053 // FIX(0.860918669) - mul t4, t0, t4 // tmp11 - mul t5, t0, t5 // -tmp14 - addu t7, t1, t2 // tmp10 - addu t8, t7, t3 // tmp10 + z4 - mul t6, t6, t8 // tmp15 - li t8, 2139 // FIX(0.261052384) - mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) - li t7, 2295 // FIX(0.280143716) - mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) - addu t9, t2, t3 // z3 + z4 - li s0, 8565 // FIX(1.045510580) - mul t9, t9, s0 // -tmp13 - li s0, 12112 // FIX(1.478575242) - mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242) - li s1, 12998 // FIX(1.586706681) - mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) - li s2, 5540 // FIX(0.676326758) - mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) - li s3, 16244 // FIX(1.982889723) - mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) - subu t1, t1, t3 // z1-=z4 - subu t0, t0, t2 // z2-=z3 - addu t2, t0, t1 // z1+z2 - li t3, 4433 // FIX_0_541196100 - mul t2, t2, t3 // z3 - li t3, 6270 // FIX_0_765366865 - mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) - li t3, 15137 // FIX_0_765366865 - mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) - addu t8, t6, t8 // tmp12 - addu t3, t8, t4 // tmp12 + tmp11 - addu t3, t3, t7 // tmp10 - subu t8, t8, t9 // tmp12 + tmp13 + mul t0, t0, t1 /* z2 */ + mul t1, t2, t3 /* z1 */ + mul t2, t4, t5 /* z3 */ + mul t3, t6, t7 /* z4 */ + li t4, 10703 /* FIX(1.306562965) */ + li t5, 4433 /* FIX_0_541196100 */ + li t6, 7053 /* FIX(0.860918669) */ + mul t4, t0, t4 /* tmp11 */ + mul t5, t0, t5 /* -tmp14 */ + addu t7, t1, t2 /* tmp10 */ + addu t8, t7, t3 /* tmp10 + z4 */ + mul t6, t6, t8 /* tmp15 */ + li t8, 2139 /* FIX(0.261052384) */ + mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */ + li t7, 2295 /* FIX(0.280143716) */ + mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */ + addu t9, t2, t3 /* z3 + z4 */ + li s0, 8565 /* FIX(1.045510580) */ + mul t9, t9, s0 /* -tmp13 */ + li s0, 12112 /* FIX(1.478575242) */ + mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242) */ + li s1, 12998 /* FIX(1.586706681) */ + mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ + li s2, 5540 /* FIX(0.676326758) */ + mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ + li s3, 16244 /* FIX(1.982889723) */ + mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ + subu t1, t1, t3 /* z1-=z4 */ + subu t0, t0, t2 /* z2-=z3 */ + addu t2, t0, t1 /* z1+z2 */ + li t3, 4433 /* FIX_0_541196100 */ + mul t2, t2, t3 /* z3 */ + li t3, 6270 /* FIX_0_765366865 */ + mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ + li t3, 15137 /* FIX_0_765366865 */ + mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ + addu t8, t6, t8 /* tmp12 */ + addu t3, t8, t4 /* tmp12 + tmp11 */ + addu t3, t3, t7 /* tmp10 */ + subu t8, t8, t9 /* tmp12 + tmp13 */ addu s0, t5, s0 - subu t8, t8, s0 // tmp12 + subu t8, t8, s0 /* tmp12 */ subu t9, t6, t9 subu s1, s1, t4 - addu t9, t9, s1 // tmp13 + addu t9, t9, s1 /* tmp13 */ subu t6, t6, t5 subu t6, t6, s2 - subu t6, t6, s3 // tmp15 - // even part start + subu t6, t6, s3 /* tmp15 */ + /* even part start */ lh t4, 64(a1) lh t5, 64(a0) lh t7, 32(a1) @@ -3710,39 +3770,43 @@ LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) lh s2, 0(a0) lh s3, 96(a1) lh v0, 96(a0) - mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) - mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) - mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) - // odd part end - addu t1, t2, t1 // tmp11 - subu t0, t2, t0 // tmp14 - // update counter and pointers + mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4], + quantptr[DCTSIZE*4]) */ + mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2], + quantptr[DCTSIZE*2]) */ + mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0]) */ + mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6], + quantptr[DCTSIZE*6]) */ + /* odd part end */ + addu t1, t2, t1 /* tmp11 */ + subu t0, t2, t0 /* tmp14 */ + /* update counter and pointers */ addiu a3, a3, -1 addiu a0, a0, 2 addiu a1, a1, 2 - // even part rest + /* even part rest */ li s1, 10033 li s2, 11190 - mul t4, t4, s1 // z4 - mul s1, t5, s2 // z4 - sll t5, t5, 13 // z1 + mul t4, t4, s1 /* z4 */ + mul s1, t5, s2 /* z4 */ + sll t5, t5, 13 /* z1 */ sll t7, t7, 13 - addiu t7, t7, 1024 // z3 - sll s0, s0, 13 // z2 - addu s2, t7, t4 // tmp10 - subu t4, t7, t4 // tmp11 - subu s3, t5, s0 // tmp12 - addu t2, t7, s3 // tmp21 - subu s3, t7, s3 // tmp24 - addu t7, s1, s0 // tmp12 - addu v0, s2, t7 // tmp20 - subu s2, s2, t7 // tmp25 - subu s1, s1, t5 // z4 - z1 - subu s1, s1, s0 // tmp12 - addu s0, t4, s1 // tmp22 - subu t4, t4, s1 // tmp23 - // final output stage + addiu t7, t7, 1024 /* z3 */ + sll s0, s0, 13 /* z2 */ + addu s2, t7, t4 /* tmp10 */ + subu t4, t7, t4 /* tmp11 */ + subu s3, t5, s0 /* tmp12 */ + addu t2, t7, s3 /* tmp21 */ + subu s3, t7, s3 /* tmp24 */ + addu t7, s1, s0 /* tmp12 */ + addu v0, s2, t7 /* tmp20 */ + subu s2, s2, t7 /* tmp25 */ + subu s1, s1, t5 /* z4 - z1 */ + subu s1, s1, s0 /* tmp12 */ + addu s0, t4, s1 /* tmp22 */ + subu t4, t4, s1 /* tmp23 */ + /* final output stage */ addu t5, v0, t3 subu v0, v0, t3 addu t3, t2, t1 @@ -3801,86 +3865,86 @@ LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) li a3, 12 1: - // Odd part + /* Odd part */ lw t0, 12(a0) lw t1, 4(a0) lw t2, 20(a0) lw t3, 28(a0) - li t4, 10703 // FIX(1.306562965) - li t5, 4433 // FIX_0_541196100 - mul t4, t0, t4 // tmp11 - mul t5, t0, t5 // -tmp14 - addu t6, t1, t2 // tmp10 - li t7, 2139 // FIX(0.261052384) - mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) - addu t6, t6, t3 // tmp10 + z4 - li t8, 7053 // FIX(0.860918669) - mul t6, t6, t8 // tmp15 - li t8, 2295 // FIX(0.280143716) - mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) - addu t9, t2, t3 // z3 + z4 - li s0, 8565 // FIX(1.045510580) - mul t9, t9, s0 // -tmp13 - li s0, 12112 // FIX(1.478575242) - mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) - li s1, 12998 // FIX(1.586706681) - mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) - li s2, 5540 // FIX(0.676326758) - mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) - li s3, 16244 // FIX(1.982889723) - mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) - subu t1, t1, t3 // z1 -= z4 - subu t0, t0, t2 // z2 -= z3 - addu t2, t1, t0 // z1 + z2 - li t3, 4433 // FIX_0_541196100 - mul t2, t2, t3 // z3 - li t3, 6270 // FIX_0_765366865 - mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) - li t3, 15137 // FIX_1_847759065 - mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) - addu t3, t6, t7 // tmp12 + li t4, 10703 /* FIX(1.306562965) */ + li t5, 4433 /* FIX_0_541196100 */ + mul t4, t0, t4 /* tmp11 */ + mul t5, t0, t5 /* -tmp14 */ + addu t6, t1, t2 /* tmp10 */ + li t7, 2139 /* FIX(0.261052384) */ + mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */ + addu t6, t6, t3 /* tmp10 + z4 */ + li t8, 7053 /* FIX(0.860918669) */ + mul t6, t6, t8 /* tmp15 */ + li t8, 2295 /* FIX(0.280143716) */ + mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */ + addu t9, t2, t3 /* z3 + z4 */ + li s0, 8565 /* FIX(1.045510580) */ + mul t9, t9, s0 /* -tmp13 */ + li s0, 12112 /* FIX(1.478575242) */ + mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */ + li s1, 12998 /* FIX(1.586706681) */ + mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ + li s2, 5540 /* FIX(0.676326758) */ + mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ + li s3, 16244 /* FIX(1.982889723) */ + mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ + subu t1, t1, t3 /* z1 -= z4 */ + subu t0, t0, t2 /* z2 -= z3 */ + addu t2, t1, t0 /* z1 + z2 */ + li t3, 4433 /* FIX_0_541196100 */ + mul t2, t2, t3 /* z3 */ + li t3, 6270 /* FIX_0_765366865 */ + mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ + li t3, 15137 /* FIX_1_847759065 */ + mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ + addu t3, t6, t7 /* tmp12 */ addu t7, t3, t4 - addu t7, t7, t8 // tmp10 + addu t7, t7, t8 /* tmp10 */ subu t3, t3, t9 subu t3, t3, t5 - subu t3, t3, s0 // tmp12 + subu t3, t3, s0 /* tmp12 */ subu t9, t6, t9 subu t9, t9, t4 - addu t9, t9, s1 // tmp13 + addu t9, t9, s1 /* tmp13 */ subu t6, t6, t5 subu t6, t6, s2 - subu t6, t6, s3 // tmp15 - addu t1, t2, t1 // tmp11 - subu t0, t2, t0 // tmp14 - // even part - lw t2, 16(a0) // z4 - lw t4, 8(a0) // z1 - lw t5, 0(a0) // z3 - lw t8, 24(a0) // z2 - li s0, 10033 // FIX(1.224744871) - li s1, 11190 // FIX(1.366025404) - mul t2, t2, s0 // z4 - mul s0, t4, s1 // z4 + subu t6, t6, s3 /* tmp15 */ + addu t1, t2, t1 /* tmp11 */ + subu t0, t2, t0 /* tmp14 */ + /* even part */ + lw t2, 16(a0) /* z4 */ + lw t4, 8(a0) /* z1 */ + lw t5, 0(a0) /* z3 */ + lw t8, 24(a0) /* z2 */ + li s0, 10033 /* FIX(1.224744871) */ + li s1, 11190 /* FIX(1.366025404) */ + mul t2, t2, s0 /* z4 */ + mul s0, t4, s1 /* z4 */ addiu t5, t5, 0x10 - sll t5, t5, 13 // z3 - sll t4, t4, 13 // z1 - sll t8, t8, 13 // z2 - subu s1, t4, t8 // tmp12 - addu s2, t5, t2 // tmp10 - subu t2, t5, t2 // tmp11 - addu s3, t5, s1 // tmp21 - subu s1, t5, s1 // tmp24 - addu t5, s0, t8 // tmp12 - addu v0, s2, t5 // tmp20 - subu t5, s2, t5 // tmp25 + sll t5, t5, 13 /* z3 */ + sll t4, t4, 13 /* z1 */ + sll t8, t8, 13 /* z2 */ + subu s1, t4, t8 /* tmp12 */ + addu s2, t5, t2 /* tmp10 */ + subu t2, t5, t2 /* tmp11 */ + addu s3, t5, s1 /* tmp21 */ + subu s1, t5, s1 /* tmp24 */ + addu t5, s0, t8 /* tmp12 */ + addu v0, s2, t5 /* tmp20 */ + subu t5, s2, t5 /* tmp25 */ subu t4, s0, t4 - subu t4, t4, t8 // tmp12 - addu t8, t2, t4 // tmp22 - subu t2, t2, t4 // tmp23 - // increment counter and pointers + subu t4, t4, t8 /* tmp12 */ + addu t8, t2, t4 /* tmp22 */ + subu t2, t2, t4 /* tmp23 */ + /* increment counter and pointers */ addiu a3, a3, -1 addiu a0, a0, 32 - // Final stage + /* Final stage */ addu t4, v0, t7 subu v0, v0, t7 addu t7, s3, t1 @@ -4169,7 +4233,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 20(a2) swc1 f14, 24(a2) swc1 f16, 28(a2) - // elemr 1 + /* elemr 1 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) @@ -4212,7 +4276,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 52(a2) swc1 f14, 56(a2) swc1 f16, 60(a2) - // elemr 2 + /* elemr 2 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) @@ -4255,7 +4319,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 84(a2) swc1 f14, 88(a2) swc1 f16, 92(a2) - // elemr 3 + /* elemr 3 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) @@ -4298,7 +4362,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 116(a2) swc1 f14, 120(a2) swc1 f16, 124(a2) - // elemr 4 + /* elemr 4 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) @@ -4341,7 +4405,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 148(a2) swc1 f14, 152(a2) swc1 f16, 156(a2) - // elemr 5 + /* elemr 5 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) @@ -4384,7 +4448,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 180(a2) swc1 f14, 184(a2) swc1 f16, 188(a2) - // elemr 6 + /* elemr 6 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) @@ -4427,7 +4491,7 @@ LEAF_DSPR2(jsimd_convsamp_float_dspr2) swc1 f12, 212(a2) swc1 f14, 216(a2) swc1 f16, 220(a2) - // elemr 7 + /* elemr 7 */ lbu t1, 0(t0) lbu t2, 1(t0) lbu t3, 2(t0) diff --git a/simd/nasm/jsimdcfg.inc.h b/simd/nasm/jsimdcfg.inc.h index 7ff7e292..bf2a45ad 100644 --- a/simd/nasm/jsimdcfg.inc.h +++ b/simd/nasm/jsimdcfg.inc.h @@ -1,8 +1,10 @@ -// This file generates the include file for the assembly -// implementations by abusing the C preprocessor. -// -// Note: Some things are manually defined as they need to -// be mapped to NASM types. +/* + * This file generates the include file for the assembly + * implementations by abusing the C preprocessor. + * + * Note: Some things are manually defined as they need to + * be mapped to NASM types. + */ ; ; Automatically generated include file from jsimdcfg.inc.h diff --git a/wrppm.c b/wrppm.c index 69f91e81..8cabaf03 100644 --- a/wrppm.c +++ b/wrppm.c @@ -5,7 +5,7 @@ * Copyright (C) 1991-1996, Thomas G. Lane. * Modified 2009 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2017, 2019, D. R. Commander. + * Copyright (C) 2017, 2019-2020, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -326,11 +326,12 @@ jinit_write_ppm(j_decompress_ptr cinfo) if (cinfo->quantize_colors || BITS_IN_JSAMPLE != 8 || sizeof(JSAMPLE) != sizeof(char) || - (cinfo->out_color_space != JCS_EXT_RGB #if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3 - && cinfo->out_color_space != JCS_RGB + (cinfo->out_color_space != JCS_EXT_RGB && + cinfo->out_color_space != JCS_RGB)) { +#else + cinfo->out_color_space != JCS_EXT_RGB) { #endif - )) { /* When quantizing, we need an output buffer for colormap indexes * that's separate from the physical I/O buffer. We also need a * separate buffer if pixel format translation must take place.