Loongson MMI SIMD extensions

Based on:
42aff4497b

Closes #158
This commit is contained in:
DRC
2018-03-01 10:38:17 -06:00
parent 35ed3c97b2
commit 33ce0b5e71
16 changed files with 4723 additions and 0 deletions

View File

@@ -112,6 +112,13 @@ libjpeg-turbo is built with YASM), and iOS/ARM[64] builds are now private.
This prevents those symbols from being exposed in applications or shared
libraries that link statically with libjpeg-turbo.
13. Added Loongson MMI SIMD implementations of the RGB-to-YCbCr and
YCbCr-to-RGB colorspace conversion, 4:2:0 chroma downsampling, 4:2:0 fancy
chroma upsampling, integer quantization, and slow integer DCT/IDCT algorithms.
When using the slow integer DCT/IDCT, this speeds up the compression of RGB
images by approximately 70-100% and the decompression of RGB images by
approximately 2-3.5x.
1.5.3
=====

View File

@@ -277,6 +277,28 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif() endif()
###############################################################################
# Loongson (Intrinsics)
###############################################################################
elseif(CPU_TYPE STREQUAL "loongson")

  # The Loongson MMI SIMD extensions are implemented with compiler intrinsics
  # rather than assembly, so the SIMD sources here are plain C files.
  set(SIMD_SOURCES loongson/jccolor-mmi.c loongson/jcsample-mmi.c
    loongson/jdcolor-mmi.c loongson/jdsample-mmi.c loongson/jfdctint-mmi.c
    loongson/jidctint-mmi.c loongson/jquanti-mmi.c)

  if(CMAKE_COMPILER_IS_GNUCC)
    foreach(file ${SIMD_SOURCES})
      # The MMI code accesses sample buffers through (__m64 *) casts, so GCC's
      # strict-aliasing optimizations must be disabled for these sources.
      # (Leading space matters: the flag is appended to existing COMPILE_FLAGS.)
      set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
        " -fno-strict-aliasing")
    endforeach()
  endif()

  add_library(simd OBJECT ${SIMD_SOURCES} loongson/jsimd.c)

  if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
    set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
  endif()
############################################################################### ###############################################################################
# PowerPC (Intrinsics) # PowerPC (Intrinsics)

View File

@@ -6,6 +6,7 @@
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California. * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited. * Copyright (C) 2014, Linaro Limited.
* Copyright (C) 2015-2016, Matthieu Darbois. * Copyright (C) 2015-2016, Matthieu Darbois.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* *
* Based on the x86 SIMD extension for IJG JPEG library, * Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru. * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -24,6 +25,7 @@
#define JSIMD_DSPR2 0x20 #define JSIMD_DSPR2 0x20
#define JSIMD_ALTIVEC 0x40 #define JSIMD_ALTIVEC 0x40
#define JSIMD_AVX2 0x80 #define JSIMD_AVX2 0x80
#define JSIMD_MMI 0x100
/* SIMD Ext: retrieve SIMD/CPU information */ /* SIMD Ext: retrieve SIMD/CPU information */
EXTERN(unsigned int) jpeg_simd_cpu_support (void); EXTERN(unsigned int) jpeg_simd_cpu_support (void);
@@ -148,6 +150,28 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows); JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_ycc_convert_altivec EXTERN(void) jsimd_rgb_ycc_convert_altivec
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows); JDIMENSION output_row, int num_rows);
@@ -406,6 +430,28 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows); JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_altivec EXTERN(void) jsimd_ycc_rgb_convert_altivec
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows); JSAMPARRAY output_buf, int num_rows);
@@ -490,6 +536,11 @@ EXTERN(void) jsimd_h2v2_downsample_dspr2
JDIMENSION v_samp_factor, JDIMENSION width_in_blocks, JDIMENSION v_samp_factor, JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data); JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_mmi
(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor, JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_altivec EXTERN(void) jsimd_h2v2_downsample_altivec
(JDIMENSION image_width, int max_v_samp_factor, (JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor, JDIMENSION width_in_blocks, JDIMENSION v_samp_factor, JDIMENSION width_in_blocks,
@@ -579,6 +630,10 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
(int max_v_samp_factor, JDIMENSION downsampled_width, (int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
(int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_altivec EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
(int max_v_samp_factor, JDIMENSION downsampled_width, (int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
@@ -854,6 +909,8 @@ EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
EXTERN(void) jsimd_fdct_islow_dspr2 (DCTELEM *data); EXTERN(void) jsimd_fdct_islow_dspr2 (DCTELEM *data);
EXTERN(void) jsimd_fdct_islow_mmi (DCTELEM *data);
EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data); EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
/* Fast Integer Forward DCT */ /* Fast Integer Forward DCT */
@@ -890,6 +947,9 @@ EXTERN(void) jsimd_quantize_neon
EXTERN(void) jsimd_quantize_dspr2 EXTERN(void) jsimd_quantize_dspr2
(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_mmi
(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_altivec EXTERN(void) jsimd_quantize_altivec
(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
@@ -966,6 +1026,10 @@ EXTERN(void) jsimd_idct_islow_dspr2
(void *dct_table, JCOEFPTR coef_block, int *output_buf, (void *dct_table, JCOEFPTR coef_block, int *output_buf,
JSAMPLE *output_col); JSAMPLE *output_col);
EXTERN(void) jsimd_idct_islow_mmi
(void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col);
EXTERN(void) jsimd_idct_islow_altivec EXTERN(void) jsimd_idct_islow_altivec
(void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col); JDIMENSION output_col);

View File

@@ -0,0 +1,470 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-mmi.c */
/*
 * Map the generic register names mmA..mmH onto the physical registers
 * mm0..mm7 according to the byte offset (0-3) of each color component
 * within the pixel (RGB_RED, RGB_GREEN, RGB_BLUE).  The conversion code
 * below is written once in terms of mmA..mmH, so re-including this file
 * with different RGB_* offsets compiles a correct routine for every
 * supported pixel ordering.  Components at offsets not matching R, G, or
 * B (e.g. the filler byte in 4-byte formats) fall through to mm6/mm7.
 */
#if RGB_RED == 0
#define mmA mm0
#define mmB mm1
#elif RGB_GREEN == 0
#define mmA mm2
#define mmB mm3
#elif RGB_BLUE == 0
#define mmA mm4
#define mmB mm5
#else
#define mmA mm6
#define mmB mm7
#endif

#if RGB_RED == 1
#define mmC mm0
#define mmD mm1
#elif RGB_GREEN == 1
#define mmC mm2
#define mmD mm3
#elif RGB_BLUE == 1
#define mmC mm4
#define mmD mm5
#else
#define mmC mm6
#define mmD mm7
#endif

#if RGB_RED == 2
#define mmE mm0
#define mmF mm1
#elif RGB_GREEN == 2
#define mmE mm2
#define mmF mm3
#elif RGB_BLUE == 2
#define mmE mm4
#define mmF mm5
#else
#define mmE mm6
#define mmF mm7
#endif

#if RGB_RED == 3
#define mmG mm0
#define mmH mm1
#elif RGB_GREEN == 3
#define mmG mm2
#define mmH mm3
#elif RGB_BLUE == 3
#define mmG mm4
#define mmH mm5
#else
#define mmG mm6
#define mmH mm7
#endif
/*
 * Convert num_rows rows of packed RGB samples (input_buf) to three planar
 * YCbCr component rows (output_buf[0..2], starting at output_row), using
 * Loongson MMI intrinsics.  Eight pixels are processed per loop iteration;
 * a trailing column count < 8 is gathered with inline assembly so that no
 * bytes beyond the end of the input row are read.
 *
 * NOTE(review): each iteration stores a full 8 bytes to outptr0/1/2 even on
 * the final partial iteration — presumably the output rows are padded to a
 * multiple of 8 samples; confirm against the caller.
 *
 * FIX: the scratch array was declared as wk[7], but indices wk[4]..wk[7]
 * are stored below (and wk[6]/wk[7] read back for the Y computation), so
 * wk[7] was an out-of-bounds access (undefined behavior).  The array must
 * hold 8 elements.
 */
void
jsimd_rgb_ycc_convert_mmi (JDIMENSION image_width, JSAMPARRAY input_buf,
                           JSAMPIMAGE output_buf, JDIMENSION output_row,
                           int num_rows)
{
  JSAMPROW inptr, outptr0, outptr1, outptr2;
  int num_cols, col;
  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
  __m64 wk[8];                    /* was wk[7]: indices 0..7 are used */
  __m64 Y_BG, Cb_RG, Cr_BG;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
         outptr0 += 8, outptr1 += 8, outptr2 += 8) {

#if RGB_PIXELSIZE == 3

      if (num_cols < 8) {
        /* Partial tail: gather the remaining col = num_cols * 3 bytes with
         * byte/halfword/word/doubleword loads (testing bits 1/2/4/8/16 of
         * the byte count) so that nothing past the row end is touched. */
        col = num_cols * 3;
        asm(".set noreorder\r\n"

            "li $8, 1\r\n"
            "move $9, %3\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 1f\r\n"
            "nop \r\n"
            "subu $9, $9, 1\r\n"
            "xor $12, $12, $12\r\n"
            "move $13, %5\r\n"
            "dadd $13, $13, $9\r\n"
            "lbu $12, 0($13)\r\n"

            "1: \r\n"
            "li $8, 2\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 2f\r\n"
            "nop \r\n"
            "subu $9, $9, 2\r\n"
            "xor $11, $11, $11\r\n"
            "move $13, %5\r\n"
            "dadd $13, $13, $9\r\n"
            "lhu $11, 0($13)\r\n"
            "sll $12, $12, 16\r\n"
            "or $12, $12, $11\r\n"

            "2: \r\n"
            "dmtc1 $12, %0\r\n"
            "li $8, 4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 3f\r\n"
            "nop \r\n"
            "subu $9, $9, 4\r\n"
            "move $13, %5\r\n"
            "dadd $13, $13, $9\r\n"
            "lwu $14, 0($13)\r\n"
            "dmtc1 $14, %1\r\n"
            "dsll32 $12, $12, 0\r\n"
            "or $12, $12, $14\r\n"
            "dmtc1 $12, %0\r\n"

            "3: \r\n"
            "li $8, 8\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 4f\r\n"
            "nop \r\n"
            "mov.s %1, %0\r\n"
            "ldc1 %0, 0(%5)\r\n"
            "li $9, 8\r\n"
            "j 5f\r\n"
            "nop \r\n"

            "4: \r\n"
            "li $8, 16\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 5f\r\n"
            "nop \r\n"
            "mov.s %2, %0\r\n"
            "ldc1 %0, 0(%5)\r\n"
            "ldc1 %1, 8(%5)\r\n"

            "5: \r\n"
            "nop \r\n"
            ".set reorder\r\n"

            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
            : "r" (col), "r" (num_rows), "r" (inptr)
            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
              "$14", "memory"
           );
      } else {
        /* Full group: load 24 bytes (8 RGB pixels). */
        mmA = _mm_load_si64((__m64 *)&inptr[0]);
        mmG = _mm_load_si64((__m64 *)&inptr[8]);
        mmF = _mm_load_si64((__m64 *)&inptr[16]);
        inptr += RGB_PIXELSIZE * 8;
      }

      /* Transpose the 24 interleaved bytes into separate even/odd
       * per-component word vectors (shift + unpack network). */
      mmD = mmA;
      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
      mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);

      mmA = _mm_unpackhi_pi8(mmA, mmG);
      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);

      mmD = _mm_unpacklo_pi8(mmD, mmF);
      mmG = _mm_unpackhi_pi8(mmG, mmF);

      mmE = mmA;
      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
      mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);

      mmA = _mm_unpackhi_pi8(mmA, mmD);
      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);

      mmE = _mm_unpacklo_pi8(mmE, mmG);
      mmD = _mm_unpackhi_pi8(mmD, mmG);

      /* Zero-extend the 8-bit components to 16-bit lanes. */
      mmC = mmA;
      mmA = _mm_loadlo_pi8_f(mmA);
      mmC = _mm_loadhi_pi8_f(mmC);

      mmB = mmE;
      mmE = _mm_loadlo_pi8_f(mmE);
      mmB = _mm_loadhi_pi8_f(mmB);

      mmF = mmD;
      mmD = _mm_loadlo_pi8_f(mmD);
      mmF = _mm_loadhi_pi8_f(mmF);

#else  /* RGB_PIXELSIZE == 4 */

      if (num_cols < 8) {
        /* Partial tail: gather the remaining pixels (4 bytes each) with
         * word/doubleword loads, again avoiding reads past the row end. */
        col = num_cols;
        asm(".set noreorder\r\n"

            "li $8, 1\r\n"
            "move $9, %4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 1f\r\n"
            "nop \r\n"
            "subu $9, $9, 1\r\n"
            "dsll $11, $9, 2\r\n"
            "move $13, %5\r\n"
            "daddu $13, $13, $11\r\n"
            "lwc1 %0, 0($13)\r\n"

            "1: \r\n"
            "li $8, 2\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 2f\r\n"
            "nop \r\n"
            "subu $9, $9, 2\r\n"
            "dsll $11, $9, 2\r\n"
            "move $13, %5\r\n"
            "daddu $13, $13, $11\r\n"
            "mov.s %1, %0\r\n"
            "ldc1 %0, 0($13)\r\n"

            "2: \r\n"
            "li $8, 4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 3f\r\n"
            "nop \r\n"
            "mov.s %2, %0\r\n"
            "mov.s %3, %1\r\n"
            "ldc1 %0, 0(%5)\r\n"
            "ldc1 %1, 8(%5)\r\n"

            "3: \r\n"
            "nop \r\n"
            ".set reorder\r\n"

            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
            : "r" (col), "r" (inptr)
            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
           );
      } else {
        /* Full group: load 32 bytes (8 RGBX pixels). */
        mmA = _mm_load_si64((__m64 *)&inptr[0]);
        mmF = _mm_load_si64((__m64 *)&inptr[8]);
        mmD = _mm_load_si64((__m64 *)&inptr[16]);
        mmC = _mm_load_si64((__m64 *)&inptr[24]);
        inptr += RGB_PIXELSIZE * 8;
      }

      /* Transpose the 4-byte pixels into per-component vectors. */
      mmB = mmA;
      mmA = _mm_unpacklo_pi8(mmA, mmF);
      mmB = _mm_unpackhi_pi8(mmB, mmF);

      mmG = mmD;
      mmD = _mm_unpacklo_pi8(mmD, mmC);
      mmG = _mm_unpackhi_pi8(mmG, mmC);

      mmE = mmA;
      mmA = _mm_unpacklo_pi16(mmA, mmD);
      mmE = _mm_unpackhi_pi16(mmE, mmD);

      mmH = mmB;
      mmB = _mm_unpacklo_pi16(mmB, mmG);
      mmH = _mm_unpackhi_pi16(mmH, mmG);

      /* Zero-extend the 8-bit components to 16-bit lanes. */
      mmC = mmA;
      mmA = _mm_loadlo_pi8_f(mmA);
      mmC = _mm_loadhi_pi8_f(mmC);

      mmD = mmB;
      mmB = _mm_loadlo_pi8_f(mmB);
      mmD = _mm_loadhi_pi8_f(mmD);

      mmG = mmE;
      mmE = _mm_loadlo_pi8_f(mmE);
      mmG = _mm_loadhi_pi8_f(mmG);

      mmF = mmH;
      mmF = _mm_unpacklo_pi8(mmF, mmH);
      mmH = _mm_unpackhi_pi8(mmH, mmH);
      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
      mmH = _mm_srli_pi16(mmH, BYTE_BIT);

#endif

      /* Save the raw even/odd component vectors for reuse below. */
      wk[0] = mm0;
      wk[1] = mm1;
      wk[2] = mm4;
      wk[3] = mm5;

      /* --- Odd pixels: Y contribution and Cb from R/G (madd against the
       *     fixed-point coefficient pairs, accumulated in 32-bit lanes). --- */
      mm6 = mm1;
      mm1 = _mm_unpacklo_pi16(mm1, mm3);
      mm6 = _mm_unpackhi_pi16(mm6, mm3);
      mm7 = mm1;
      mm4 = mm6;
      mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
      mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);

      wk[4] = mm1;
      wk[5] = mm6;

      mm1 = _mm_loadlo_pi16_f(mm5);
      mm6 = _mm_loadhi_pi16_f(mm5);
      mm1 = _mm_srli_pi32(mm1, 1);
      mm6 = _mm_srli_pi32(mm6, 1);

      /* Round, bias by CENTERJSAMPLE, and descale to 16 bits. */
      mm5 = PD_ONEHALFM1_CJ;
      mm7 = _mm_add_pi32(mm7, mm1);
      mm4 = _mm_add_pi32(mm4, mm6);
      mm7 = _mm_add_pi32(mm7, mm5);
      mm4 = _mm_add_pi32(mm4, mm5);
      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
      mm7 = _mm_packs_pi32(mm7, mm4);

      /* --- Even pixels: same computation. --- */
      mm1 = wk[2];
      mm6 = mm0;
      mm0 = _mm_unpacklo_pi16(mm0, mm2);
      mm6 = _mm_unpackhi_pi16(mm6, mm2);
      mm5 = mm0;
      mm4 = mm6;
      mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
      mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);

      wk[6] = mm0;                  /* requires wk[] to have 8 elements */
      wk[7] = mm6;

      mm0 = _mm_loadlo_pi16_f(mm1);
      mm6 = _mm_loadhi_pi16_f(mm1);
      mm0 = _mm_srli_pi32(mm0, 1);
      mm6 = _mm_srli_pi32(mm6, 1);

      mm1 = PD_ONEHALFM1_CJ;
      mm5 = _mm_add_pi32(mm5, mm0);
      mm4 = _mm_add_pi32(mm4, mm6);
      mm5 = _mm_add_pi32(mm5, mm1);
      mm4 = _mm_add_pi32(mm4, mm1);
      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
      mm5 = _mm_packs_pi32(mm5, mm4);

      /* Interleave odd Cb into the high bytes of even Cb. */
      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
      mm5 = _mm_or_si64(mm5, mm7);
      Cb_RG = mm5;

      /* --- Y from B (plus the saved R/G contributions) and Cr from B/G,
       *     odd pixels first. --- */
      mm0 = wk[3];
      mm6 = wk[2];
      mm1 = wk[1];
      mm4 = mm0;
      mm0 = _mm_unpacklo_pi16(mm0, mm3);
      mm4 = _mm_unpackhi_pi16(mm4, mm3);
      mm7 = mm0;
      mm5 = mm4;
      mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
      mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);

      mm3 = PD_ONEHALF;
      mm0 = _mm_add_pi32(mm0, wk[4]);
      mm4 = _mm_add_pi32(mm4, wk[5]);
      mm0 = _mm_add_pi32(mm0, mm3);
      mm4 = _mm_add_pi32(mm4, mm3);
      mm0 = _mm_srli_pi32(mm0, SCALEBITS);
      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
      mm0 = _mm_packs_pi32(mm0, mm4);

      mm3 = _mm_loadlo_pi16_f(mm1);
      mm4 = _mm_loadhi_pi16_f(mm1);
      mm3 = _mm_srli_pi32(mm3, 1);
      mm4 = _mm_srli_pi32(mm4, 1);

      mm1 = PD_ONEHALFM1_CJ;
      mm7 = _mm_add_pi32(mm7, mm3);
      mm5 = _mm_add_pi32(mm5, mm4);
      mm7 = _mm_add_pi32(mm7, mm1);
      mm5 = _mm_add_pi32(mm5, mm1);
      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
      mm7 = _mm_packs_pi32(mm7, mm5);

      /* --- Even pixels. --- */
      mm3 = wk[0];
      mm4 = mm6;
      mm6 = _mm_unpacklo_pi16(mm6, mm2);
      mm4 = _mm_unpackhi_pi16(mm4, mm2);
      mm1 = mm6;
      mm5 = mm4;
      mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
      mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);

      mm2 = PD_ONEHALF;
      mm6 = _mm_add_pi32(mm6, wk[6]);
      mm4 = _mm_add_pi32(mm4, wk[7]);
      mm6 = _mm_add_pi32(mm6, mm2);
      mm4 = _mm_add_pi32(mm4, mm2);
      mm6 = _mm_srli_pi32(mm6, SCALEBITS);
      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
      mm6 = _mm_packs_pi32(mm6, mm4);

      /* Interleave odd Y into the high bytes of even Y. */
      mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
      mm6 = _mm_or_si64(mm6, mm0);
      Y_BG = mm6;

      mm2 = _mm_loadlo_pi16_f(mm3);
      mm4 = _mm_loadhi_pi16_f(mm3);
      mm2 = _mm_srli_pi32(mm2, 1);
      mm4 = _mm_srli_pi32(mm4, 1);

      mm0 = PD_ONEHALFM1_CJ;
      mm1 = _mm_add_pi32(mm1, mm2);
      mm5 = _mm_add_pi32(mm5, mm4);
      mm1 = _mm_add_pi32(mm1, mm0);
      mm5 = _mm_add_pi32(mm5, mm0);
      mm1 = _mm_srli_pi32(mm1, SCALEBITS);
      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
      mm1 = _mm_packs_pi32(mm1, mm5);

      /* Interleave odd Cr into the high bytes of even Cr. */
      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
      mm1 = _mm_or_si64(mm1, mm7);
      Cr_BG = mm1;

      /* Store 8 samples per component plane. */
      _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
      _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
      _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
    }
  }
}
#undef mmA
#undef mmB
#undef mmC
#undef mmD
#undef mmE
#undef mmF
#undef mmG
#undef mmH

148
simd/loongson/jccolor-mmi.c Normal file
View File

@@ -0,0 +1,148 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* RGB --> YCC CONVERSION */

#include "jsimd_mmi.h"

/* Fixed-point representations of the RGB->YCbCr conversion coefficients,
 * scaled by 2^SCALEBITS (FIX(x) per the per-line comments). */
#define F_0_081 ((short) 5329)                /* FIX(0.08131) */
#define F_0_114 ((short) 7471)                /* FIX(0.11400) */
#define F_0_168 ((short)11059)                /* FIX(0.16874) */
#define F_0_250 ((short)16384)                /* FIX(0.25000) */
#define F_0_299 ((short)19595)                /* FIX(0.29900) */
#define F_0_331 ((short)21709)                /* FIX(0.33126) */
#define F_0_418 ((short)27439)                /* FIX(0.41869) */
/* NOTE(review): 38470 exceeds SHRT_MAX, so the (short) conversion is
 * implementation-defined; on two's-complement targets the subtraction below
 * still yields the intended F_0_337 value (22086).  F_0_587 is only used to
 * derive F_0_337. */
#define F_0_587 ((short)38470)                /* FIX(0.58700) */
#define F_0_337 ((short)(F_0_587 - F_0_250))  /* FIX(0.58700) - FIX(0.25000) */

/* Indices into const_value[] below; one 64-bit SIMD constant each. */
enum const_index {
  index_PD_ONEHALF,        /* 32-bit rounding constant: 1 << (SCALEBITS-1) */
  index_PW_F0299_F0337,    /* word pairs {0.337, 0.299} for the Y terms */
  index_PW_F0114_F0250,    /* word pairs {0.250, 0.114} */
  index_PW_MF016_MF033,    /* word pairs {-0.331, -0.168} for the Cb terms */
  index_PW_MF008_MF041,    /* word pairs {-0.418, -0.081} for the Cr terms */
  index_PD_ONEHALFM1_CJ    /* rounding - 1 + (CENTERJSAMPLE << SCALEBITS) */
};

static uint64_t const_value[] = {
  _uint64_set_pi32((int)(1 << (SCALEBITS-1)), (int)(1 << (SCALEBITS-1))),
  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
  _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
  _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
  _uint64_set_pi32(((1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
                   ((1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
};

/* Reinterpret a table entry as an __m64 vector constant. */
#define get_const_value(index)  (*(__m64 *)&const_value[index])

#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
#define PW_MF016_MF033   get_const_value(index_PW_MF016_MF033)
#define PW_MF008_MF041   get_const_value(index_PW_MF008_MF041)
#define PD_ONEHALFM1_CJ  get_const_value(index_PD_ONEHALFM1_CJ)

/* Instantiate the conversion routine once per pixel format.  Each
 * re-inclusion of jccolext-mmi.c compiles the same template with different
 * RGB_* component offsets and a different function name. */
#include "jccolext-mmi.c"   /* default RGB ordering */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE

#define RGB_RED  EXT_RGB_RED
#define RGB_GREEN  EXT_RGB_GREEN
#define RGB_BLUE  EXT_RGB_BLUE
#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgb_ycc_convert_mmi
#include "jccolext-mmi.c"   /* extended RGB */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_mmi
#define RGB_RED  EXT_RGBX_RED
#define RGB_GREEN  EXT_RGBX_GREEN
#define RGB_BLUE  EXT_RGBX_BLUE
#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgbx_ycc_convert_mmi
#include "jccolext-mmi.c"   /* RGBX (4-byte) */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_mmi
#define RGB_RED  EXT_BGR_RED
#define RGB_GREEN  EXT_BGR_GREEN
#define RGB_BLUE  EXT_BGR_BLUE
#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgr_ycc_convert_mmi
#include "jccolext-mmi.c"   /* BGR */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_mmi
#define RGB_RED  EXT_BGRX_RED
#define RGB_GREEN  EXT_BGRX_GREEN
#define RGB_BLUE  EXT_BGRX_BLUE
#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgrx_ycc_convert_mmi
#include "jccolext-mmi.c"   /* BGRX (4-byte) */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_mmi
#define RGB_RED  EXT_XBGR_RED
#define RGB_GREEN  EXT_XBGR_GREEN
#define RGB_BLUE  EXT_XBGR_BLUE
#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_mmi  jsimd_extxbgr_ycc_convert_mmi
#include "jccolext-mmi.c"   /* XBGR (4-byte) */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_mmi
#define RGB_RED  EXT_XRGB_RED
#define RGB_GREEN  EXT_XRGB_GREEN
#define RGB_BLUE  EXT_XRGB_BLUE
#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_mmi  jsimd_extxrgb_ycc_convert_mmi
#include "jccolext-mmi.c"   /* XRGB (4-byte) */

#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_mmi

View File

@@ -0,0 +1,101 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* CHROMA DOWNSAMPLING */
#include "jsimd_mmi.h"
#include "jcsample.h"
/*
 * 4:2:0 (h2v2) chroma downsampling: average each 2x2 block of input samples
 * into one output sample, 8 output samples (16 input columns, 2 input rows)
 * per inner-loop iteration.
 */
void
jsimd_h2v2_downsample_mmi (JDIMENSION image_width, int max_v_samp_factor,
                           JDIMENSION v_samp_factor,
                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
                           JSAMPARRAY output_data)
{
  int inrow, outrow, outcol, bias;
  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  JSAMPROW inptr0, inptr1, outptr;
  /* NOTE(review): mm6 is initialized with a floating-point literal —
     presumably __m64 maps to an FP-register type in jsimd_mmi.h; confirm. */
  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;

  /* Replicate the rightmost column so full 16-byte groups can be read. */
  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  /* Alternating rounding bias {1, 2, 1, 2} so truncation errors cancel. */
  bias = (1 << 17) + 1;               /* 0x00020001 (bias pattern) */
  mm7 = _mm_set1_pi32(bias);          /* mm7={1, 2, 1, 2} */
  mm6 = _mm_cmpeq_pi16(mm6, mm6);
  mm6 = _mm_srli_pi16(mm6, BYTE_BIT); /* mm6={0xFF 0x00 0xFF 0x00 ..} */

  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
       inrow += 2, outrow++) {
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr = output_data[outrow];

    for (outcol = output_cols; outcol > 0;
         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
      /* Load 16 samples from each of the two input rows. */
      mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
      mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
      mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
      mm3 = _mm_load_si64((__m64 *)&inptr1[8]);

      /* Split even/odd bytes into 16-bit lanes and add horizontal pairs. */
      mm4 = mm0;
      mm5 = mm1;
      mm0 = _mm_and_si64(mm0, mm6);
      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
      mm1 = _mm_and_si64(mm1, mm6);
      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
      mm0 = _mm_add_pi16(mm0, mm4);
      mm1 = _mm_add_pi16(mm1, mm5);

      mm4 = mm2;
      mm5 = mm3;
      mm2 = _mm_and_si64(mm2, mm6);
      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
      mm3 = _mm_and_si64(mm3, mm6);
      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
      mm2 = _mm_add_pi16(mm2, mm4);
      mm3 = _mm_add_pi16(mm3, mm5);

      /* Sum the two rows, add the bias, and divide by 4. */
      mm0 = _mm_add_pi16(mm0, mm1);
      mm2 = _mm_add_pi16(mm2, mm3);
      mm0 = _mm_add_pi16(mm0, mm7);
      mm2 = _mm_add_pi16(mm2, mm7);
      mm0 = _mm_srli_pi16(mm0, 2);
      mm2 = _mm_srli_pi16(mm2, 2);

      /* Pack back to bytes with unsigned saturation and store 8 samples. */
      mm0 = _mm_packs_pu16(mm0, mm2);
      _mm_store_si64((__m64 *)&outptr[0], mm0);
    }
  }
}

28
simd/loongson/jcsample.h Normal file
View File

@@ -0,0 +1,28 @@
/*
* jcsample.h
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
/*
 * Pad each of num_rows image rows on the right from input_cols out to
 * output_cols by replicating the last real sample of the row.  Used so the
 * SIMD loops can read full sample groups without running off the row.
 */
LOCAL(void)
expand_right_edge (JSAMPARRAY image_data, int num_rows,
                   JDIMENSION input_cols, JDIMENSION output_cols)
{
  int pad = (int) (output_cols - input_cols);
  int row, i;

  if (pad <= 0)
    return;                       /* nothing to replicate */

  for (row = 0; row < num_rows; row++) {
    JSAMPROW dst = image_data[row] + input_cols;
    JSAMPLE edge = dst[-1];       /* rightmost real sample of this row */

    for (i = 0; i < pad; i++)
      dst[i] = edge;
  }
}

View File

@@ -0,0 +1,425 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdcolor-mmi.c */
#if RGB_RED == 0
#define mmA mm0
#define mmB mm1
#elif RGB_GREEN == 0
#define mmA mm2
#define mmB mm3
#elif RGB_BLUE == 0
#define mmA mm4
#define mmB mm5
#else
#define mmA mm6
#define mmB mm7
#endif
#if RGB_RED == 1
#define mmC mm0
#define mmD mm1
#elif RGB_GREEN == 1
#define mmC mm2
#define mmD mm3
#elif RGB_BLUE == 1
#define mmC mm4
#define mmD mm5
#else
#define mmC mm6
#define mmD mm7
#endif
#if RGB_RED == 2
#define mmE mm0
#define mmF mm1
#elif RGB_GREEN == 2
#define mmE mm2
#define mmF mm3
#elif RGB_BLUE == 2
#define mmE mm4
#define mmF mm5
#else
#define mmE mm6
#define mmF mm7
#endif
#if RGB_RED == 3
#define mmG mm0
#define mmH mm1
#elif RGB_GREEN == 3
#define mmG mm2
#define mmH mm3
#elif RGB_BLUE == 3
#define mmG mm4
#define mmH mm5
#else
#define mmG mm6
#define mmH mm7
#endif
/*
 * Convert num_rows rows of YCbCr samples to packed RGB pixels.
 *
 * out_width  = pixels per row
 * input_buf  = component arrays: input_buf[0]=Y, input_buf[1]=Cb,
 *              input_buf[2]=Cr
 * input_row  = index of the first row to convert within each component array
 * output_buf = array of num_rows output row pointers
 *
 * Eight pixels are converted per loop iteration.  A full 8-pixel group is
 * written with 64-bit stores; when fewer than 8 pixels remain, the inline-asm
 * tail stores the exact number of remaining bytes (16/8/4/2/1-byte steps) so
 * no memory past the end of the row is written.
 */
void
jsimd_ycc_rgb_convert_mmi (JDIMENSION out_width, JSAMPIMAGE input_buf,
                           JDIMENSION input_row, JSAMPARRAY output_buf,
                           int num_rows)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int num_cols, col;
  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
  __m64 mm8, wk[2];

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];   /* Y row */
    inptr1 = input_buf[1][input_row];   /* Cb row */
    inptr2 = input_buf[2][input_row];   /* Cr row */
    input_row++;
    outptr = *output_buf++;

    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
         inptr0 += 8, inptr1 += 8, inptr2 += 8) {

      mm5 = _mm_load_si64((__m64 *)inptr1);   /* mm5=Cb(01234567) */
      mm1 = _mm_load_si64((__m64 *)inptr2);   /* mm1=Cr(01234567) */
      mm8 = _mm_load_si64((__m64 *)inptr0);   /* mm8=Y(01234567) */

      /* Materialize two constants in registers:
       * mm4 = {0x00FF x4} (even-byte mask), mm7 = {0xFF80 x4} */
      mm4 = 0;
      mm7 = 0;
      mm4 = _mm_cmpeq_pi16(mm4, mm4);
      mm7 = _mm_cmpeq_pi16(mm7, mm7);
      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
      mm7 = _mm_slli_pi16(mm7, 7);  /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
      mm0 = mm4;  /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */

      /* Split Cb/Cr bytes into even- and odd-indexed 16-bit lanes */
      mm4 = _mm_and_si64(mm4, mm5);         /* mm4=Cb(0246)=CbE */
      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);   /* mm5=Cb(1357)=CbO */
      mm0 = _mm_and_si64(mm0, mm1);         /* mm0=Cr(0246)=CrE */
      mm1 = _mm_srli_pi16(mm1, BYTE_BIT);   /* mm1=Cr(1357)=CrO */

      /* Center chroma on zero: adding 0xFF80 subtracts 128 (mod 2^16) */
      mm4 = _mm_add_pi16(mm4, mm7);
      mm5 = _mm_add_pi16(mm5, mm7);
      mm0 = _mm_add_pi16(mm0, mm7);
      mm1 = _mm_add_pi16(mm1, mm7);

      /* (Original)
       * R = Y + 1.40200 * Cr
       * G = Y - 0.34414 * Cb - 0.71414 * Cr
       * B = Y + 1.77200 * Cb
       *
       * (This implementation)
       * R = Y + 0.40200 * Cr + Cr
       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
       * B = Y - 0.22800 * Cb + Cb + Cb
       */

      mm2 = mm4;  /* mm2 = CbE */
      mm3 = mm5;  /* mm3 = CbO */
      mm4 = _mm_add_pi16(mm4, mm4);  /* mm4 = 2*CbE */
      mm5 = _mm_add_pi16(mm5, mm5);  /* mm5 = 2*CbO */
      mm6 = mm0;  /* mm6 = CrE */
      mm7 = mm1;  /* mm7 = CrO */
      mm0 = _mm_add_pi16(mm0, mm0);  /* mm0 = 2*CrE */
      mm1 = _mm_add_pi16(mm1, mm1);  /* mm1 = 2*CrO */

      /* The operand is doubled before the high-half multiply so one extra
       * fraction bit survives; it is rounded off again with (x + 1) >> 1. */
      mm4 = _mm_mulhi_pi16(mm4, PW_MF0228);  /* mm4=(2*CbE * -FIX(0.22800) */
      mm5 = _mm_mulhi_pi16(mm5, PW_MF0228);  /* mm5=(2*CbO * -FIX(0.22800) */
      mm0 = _mm_mulhi_pi16(mm0, PW_F0402);   /* mm0=(2*CrE * FIX(0.40200)) */
      mm1 = _mm_mulhi_pi16(mm1, PW_F0402);   /* mm1=(2*CrO * FIX(0.40200)) */

      mm4 = _mm_add_pi16(mm4, PW_ONE);
      mm5 = _mm_add_pi16(mm5, PW_ONE);
      mm4 = _mm_srai_pi16(mm4, 1);  /* mm4=(CbE * -FIX(0.22800)) */
      mm5 = _mm_srai_pi16(mm5, 1);  /* mm5=(CbO * -FIX(0.22800)) */
      mm0 = _mm_add_pi16(mm0, PW_ONE);
      mm1 = _mm_add_pi16(mm1, PW_ONE);
      mm0 = _mm_srai_pi16(mm0, 1);  /* mm0=(CrE * FIX(0.40200)) */
      mm1 = _mm_srai_pi16(mm1, 1);  /* mm1=(CrO * FIX(0.40200)) */

      mm4 = _mm_add_pi16(mm4, mm2);
      mm5 = _mm_add_pi16(mm5, mm3);
      mm4 = _mm_add_pi16(mm4, mm2);  /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
      mm5 = _mm_add_pi16(mm5, mm3);  /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
      mm0 = _mm_add_pi16(mm0, mm6);  /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
      mm1 = _mm_add_pi16(mm1, mm7);  /* mm1=(CrO * FIX(1.40200))=(R-Y)O */

      /* Park the blue differences; all 8 registers are needed for green */
      wk[0] = mm4;  /* wk(0)=(B-Y)E */
      wk[1] = mm5;  /* wk(1)=(B-Y)O */

      /* Green: madd pairs (Cb, Cr) with (-0.34414, 0.28586) in 32-bit
       * precision, round at SCALEBITS, then subtract Cr once more */
      mm4 = mm2;
      mm5 = mm3;
      mm2 = _mm_unpacklo_pi16(mm2, mm6);
      mm4 = _mm_unpackhi_pi16(mm4, mm6);
      mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
      mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
      mm3 = _mm_unpacklo_pi16(mm3, mm7);
      mm5 = _mm_unpackhi_pi16(mm5, mm7);
      mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
      mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);

      mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
      mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
      mm2 = _mm_srai_pi32(mm2, SCALEBITS);
      mm4 = _mm_srai_pi32(mm4, SCALEBITS);
      mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
      mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
      mm3 = _mm_srai_pi32(mm3, SCALEBITS);
      mm5 = _mm_srai_pi32(mm5, SCALEBITS);

      mm2 = _mm_packs_pi32(mm2, mm4);  /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
      mm3 = _mm_packs_pi32(mm3, mm5);  /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
      mm2 = _mm_sub_pi16(mm2, mm6);  /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
      mm3 = _mm_sub_pi16(mm3, mm7);  /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */

      /* Split Y into even/odd lanes and add it back to each difference,
       * saturating to [0, 255] via the unsigned pack */
      mm5 = mm8;  /* mm5=Y(01234567) */

      mm4 = _mm_cmpeq_pi16(mm4, mm4);
      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);   /* mm4={0xFF 0x00 0xFF 0x00 ..} */
      mm4 = _mm_and_si64(mm4, mm5);         /* mm4=Y(0246)=YE */
      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);   /* mm5=Y(1357)=YO */

      mm0 = _mm_add_pi16(mm0, mm4);    /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
      mm1 = _mm_add_pi16(mm1, mm5);    /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
      mm0 = _mm_packs_pu16(mm0, mm0);  /* mm0=(R0 R2 R4 R6 ** ** ** **) */
      mm1 = _mm_packs_pu16(mm1, mm1);  /* mm1=(R1 R3 R5 R7 ** ** ** **) */

      mm2 = _mm_add_pi16(mm2, mm4);    /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
      mm3 = _mm_add_pi16(mm3, mm5);    /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
      mm2 = _mm_packs_pu16(mm2, mm2);  /* mm2=(G0 G2 G4 G6 ** ** ** **) */
      mm3 = _mm_packs_pu16(mm3, mm3);  /* mm3=(G1 G3 G5 G7 ** ** ** **) */

      mm4 = _mm_add_pi16(mm4, wk[0]);  /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
      mm5 = _mm_add_pi16(mm5, wk[1]);  /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
      mm4 = _mm_packs_pu16(mm4, mm4);  /* mm4=(B0 B2 B4 B6 ** ** ** **) */
      mm5 = _mm_packs_pu16(mm5, mm5);  /* mm5=(B1 B3 B5 B7 ** ** ** **) */

#if RGB_PIXELSIZE == 3

      /* Interleave the six even/odd component registers into 24 packed
       * RGB bytes across mmA, mmE, mmC. */
      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
      mmA = _mm_unpacklo_pi8(mmA, mmC);  /* mmA=(00 10 02 12 04 14 06 16) */
      mmE = _mm_unpacklo_pi8(mmE, mmB);  /* mmE=(20 01 22 03 24 05 26 07) */
      mmD = _mm_unpacklo_pi8(mmD, mmF);  /* mmD=(11 21 13 23 15 25 17 27) */

      mmG = mmA;
      mmH = mmA;
      mmA = _mm_unpacklo_pi16(mmA, mmE);  /* mmA=(00 10 20 01 02 12 22 03) */
      mmG = _mm_unpackhi_pi16(mmG, mmE);  /* mmG=(04 14 24 05 06 16 26 07) */

      mmH = _mm_srli_si64(mmH, 2*BYTE_BIT);
      mmE = _mm_srli_si64(mmE, 2*BYTE_BIT);

      mmC = mmD;
      mmB = mmD;
      mmD = _mm_unpacklo_pi16(mmD, mmH);  /* mmD=(11 21 02 12 13 23 04 14) */
      mmC = _mm_unpackhi_pi16(mmC, mmH);  /* mmC=(15 25 06 16 17 27 -- --) */

      mmB = _mm_srli_si64(mmB, 2*BYTE_BIT);  /* mmB=(13 23 15 25 17 27 -- --) */

      mmF = mmE;
      mmE = _mm_unpacklo_pi16(mmE, mmB);  /* mmE=(22 03 13 23 24 05 15 25) */
      mmF = _mm_unpackhi_pi16(mmF, mmB);  /* mmF=(26 07 17 27 -- -- -- --) */

      mmA = _mm_unpacklo_pi32(mmA, mmD);  /* mmA=(00 10 20 01 11 21 02 12) */
      mmE = _mm_unpacklo_pi32(mmE, mmG);  /* mmE=(22 03 13 23 04 14 24 05) */
      mmC = _mm_unpacklo_pi32(mmC, mmF);  /* mmC=(15 25 06 16 26 07 17 27) */

      if (num_cols >= 8) {
        _mm_store_si64((__m64 *)outptr, mmA);
        _mm_store_si64((__m64 *)(outptr+8), mmE);
        _mm_store_si64((__m64 *)(outptr+16), mmC);
        outptr += RGB_PIXELSIZE*8;
      } else {
        /* Partial-group tail: store exactly col = num_cols*3 bytes of
         * mmA:mmE:mmC using unaligned 8-byte (gssdlc1/gssdrc1), 4-byte
         * (swl/swr), 2-byte (ush), and 1-byte (sb) stores in turn. */
        col = num_cols * 3;
        asm(".set noreorder\r\n"

            "li $8, 16\r\n"
            "move $9, %4\r\n"
            "mov.s $f4, %1\r\n"
            "mov.s $f6, %3\r\n"
            "move $10, %5\r\n"
            "bltu $9, $8, 1f\r\n"
            "nop \r\n"
            "gssdlc1 $f4, 7($10)\r\n"
            "gssdrc1 $f4, 0($10)\r\n"
            "gssdlc1 $f6, 7+8($10)\r\n"
            "gssdrc1 $f6, 8($10)\r\n"
            "mov.s $f4, %2\r\n"
            "subu $9, $9, 16\r\n"
            "daddu $10, $10, 16\r\n"
            "b 2f\r\n"
            "nop \r\n"

            "1: \r\n"
            "li $8, 8\r\n" /* st8 */
            "bltu $9, $8, 2f\r\n"
            "nop \r\n"
            "gssdlc1 $f4, 7($10)\r\n"
            "gssdrc1 $f4, ($10)\r\n"
            "mov.s $f4, %3\r\n"
            "subu $9, $9, 8\r\n"
            "daddu $10, $10, 8\r\n"

            "2: \r\n"
            "li $8, 4\r\n" /* st4 */
            "mfc1 $11, $f4\r\n"
            "bltu $9, $8, 3f\r\n"
            "nop \r\n"
            "swl $11, 3($10)\r\n"
            "swr $11, 0($10)\r\n"
            "li $8, 32\r\n"
            "mtc1 $8, $f6\r\n"
            "dsrl $f4, $f4, $f6\r\n"
            "mfc1 $11, $f4\r\n"
            "subu $9, $9, 4\r\n"
            "daddu $10, $10, 4\r\n"

            "3: \r\n"
            "li $8, 2\r\n" /* st2 */
            "bltu $9, $8, 4f\r\n"
            "nop \r\n"
            "ush $11, 0($10)\r\n"
            "srl $11, 16\r\n"
            "subu $9, $9, 2\r\n"
            "daddu $10, $10, 2\r\n"

            "4: \r\n"
            "li $8, 1\r\n" /* st1 */
            "bltu $9, $8, 5f\r\n"
            "nop \r\n"
            "sb $11, 0($10)\r\n"

            "5: \r\n"
            "nop \r\n" /* end */
            : "=m" (*outptr)
            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
           );
      }

#else /* RGB_PIXELSIZE == 4 */

      /* Fill the X component: all-ones when RGBX_FILLER_0XFF is defined,
       * otherwise zero */
#ifdef RGBX_FILLER_0XFF
      mm6 = _mm_cmpeq_pi8(mm6, mm6);
      mm7 = _mm_cmpeq_pi8(mm7, mm7);
#else
      mm6 = _mm_xor_si64(mm6, mm6);
      mm7 = _mm_xor_si64(mm7, mm7);
#endif
      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
      mmA = _mm_unpacklo_pi8(mmA, mmC);  /* mmA=(00 10 02 12 04 14 06 16) */
      mmE = _mm_unpacklo_pi8(mmE, mmG);  /* mmE=(20 30 22 32 24 34 26 36) */
      mmB = _mm_unpacklo_pi8(mmB, mmD);  /* mmB=(01 11 03 13 05 15 07 17) */
      mmF = _mm_unpacklo_pi8(mmF, mmH);  /* mmF=(21 31 23 33 25 35 27 37) */

      mmC = mmA;
      mmA = _mm_unpacklo_pi16(mmA, mmE);  /* mmA=(00 10 20 30 02 12 22 32) */
      mmC = _mm_unpackhi_pi16(mmC, mmE);  /* mmC=(04 14 24 34 06 16 26 36) */
      mmG = mmB;
      mmB = _mm_unpacklo_pi16(mmB, mmF);  /* mmB=(01 11 21 31 03 13 23 33) */
      mmG = _mm_unpackhi_pi16(mmG, mmF);  /* mmG=(05 15 25 35 07 17 27 37) */

      mmD = mmA;
      mmA = _mm_unpacklo_pi32(mmA, mmB);  /* mmA=(00 10 20 30 01 11 21 31) */
      mmD = _mm_unpackhi_pi32(mmD, mmB);  /* mmD=(02 12 22 32 03 13 23 33) */
      mmH = mmC;
      mmC = _mm_unpacklo_pi32(mmC, mmG);  /* mmC=(04 14 24 34 05 15 25 35) */
      mmH = _mm_unpackhi_pi32(mmH, mmG);  /* mmH=(06 16 26 36 07 17 27 37) */

      if (num_cols >= 8) {
        _mm_store_si64((__m64 *)outptr, mmA);
        _mm_store_si64((__m64 *)(outptr+8), mmD);
        _mm_store_si64((__m64 *)(outptr+16), mmC);
        _mm_store_si64((__m64 *)(outptr+24), mmH);
        outptr += RGB_PIXELSIZE*8;
      } else {
        /* Partial-group tail: col = remaining pixels (4 bytes each); store
         * 4, then 2, then 1 pixel(s) at a time from mmA, mmD, mmC, mmH. */
        col = num_cols;
        asm(".set noreorder\r\n" /* st16 */

            "li $8, 4\r\n"
            "move $9, %6\r\n"
            "move $10, %7\r\n"
            "mov.s $f4, %2\r\n"
            "mov.s $f6, %4\r\n"
            "bltu $9, $8, 1f\r\n"
            "nop \r\n"
            "gssdlc1 $f4, 7($10)\r\n"
            "gssdrc1 $f4, ($10)\r\n"
            "gssdlc1 $f6, 7+8($10)\r\n"
            "gssdrc1 $f6, 8($10)\r\n"
            "mov.s $f4, %3\r\n"
            "mov.s $f6, %5\r\n"
            "subu $9, $9, 4\r\n"
            "daddu $10, $10, 16\r\n"

            "1: \r\n"
            "li $8, 2\r\n" /* st8 */
            "bltu $9, $8, 2f\r\n"
            "nop \r\n"
            "gssdlc1 $f4, 7($10)\r\n"
            "gssdrc1 $f4, 0($10)\r\n"
            "mov.s $f4, $f6\r\n"
            "subu $9, $9, 2\r\n"
            "daddu $10, $10, 8\r\n"

            "2: \r\n"
            "li $8, 1\r\n" /* st4 */
            "bltu $9, $8, 3f\r\n"
            "nop \r\n"
            "gsswlc1 $f4, 3($10)\r\n"
            "gsswrc1 $f4, 0($10)\r\n"

            "3: \r\n"
            "li %1, 0\r\n" /* end */
            : "=m" (*outptr), "=r" (col)
            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
              "r" (outptr)
            : "$f4", "$f6", "$8", "$9", "$10", "memory"
           );
      }

#endif

    }
  }
}
/* Release the per-colorspace register aliases so this file can be
 * re-included with a different RGB_RED/RGB_GREEN/RGB_BLUE layout. */
#undef mmA
#undef mmB
#undef mmC
#undef mmD
#undef mmE
#undef mmF
#undef mmG
#undef mmH

139
simd/loongson/jdcolor-mmi.c Normal file
View File

@@ -0,0 +1,139 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* YCC --> RGB CONVERSION */

#include "jsimd_mmi.h"


/* 16-bit fixed-point constants for the YCbCr->RGB equations; each value is
 * round(x * 65536) for the coefficient noted in its comment. */
#define F_0_344 ((short)22554) /* FIX(0.34414) */
#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */

enum const_index {
  index_PW_ONE,
  index_PW_F0402,
  index_PW_MF0228,
  index_PW_MF0344_F0285,
  index_PD_ONEHALF
};

/* Constant table loaded through get_const_value() (see jsimd_mmi.h);
 * the order must match enum const_index above. */
static uint64_t const_value[] = {
  _uint64_set_pi16(1, 1, 1, 1),
  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
  _uint64_set_pi32((int)(1 << (SCALEBITS-1)), (int)(1 << (SCALEBITS-1)))
};

#define PW_ONE get_const_value(index_PW_ONE)
#define PW_F0402 get_const_value(index_PW_F0402)
#define PW_MF0228 get_const_value(index_PW_MF0228)
#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
#define PD_ONEHALF get_const_value(index_PD_ONEHALF)

/* Fill the X byte of 4-byte pixel formats with 0xFF instead of 0
 * (tested by #ifdef in jdcolext-mmi.c) */
#define RGBX_FILLER_0XFF 1

/* Instantiate the converter once per output colorspace by re-including
 * jdcolext-mmi.c with the appropriate component-order macros. */

/* Default RGB order (RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE as already
 * defined): jsimd_ycc_rgb_convert_mmi */
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE

/* Extended RGB (3-byte R,G,B) */
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_mmi

/* RGBX (4-byte R,G,B,X) */
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_mmi

/* BGR (3-byte B,G,R) */
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_mmi

/* BGRX (4-byte B,G,R,X) */
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_mmi

/* XBGR (4-byte X,B,G,R) */
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_mmi

/* XRGB (4-byte X,R,G,B) */
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi
#include "jdcolext-mmi.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_mmi

View File

@@ -0,0 +1,247 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* CHROMA UPSAMPLING */

#include "jsimd_mmi.h"


/* Word-replicated weights for the triangle (fancy) upsampling filter:
 * output = (3*nearer + farther + rounding) >> 4, with rounding of 8 for
 * even outputs and 7 for odd outputs. */
enum const_index {
  index_PW_THREE,
  index_PW_SEVEN,
  index_PW_EIGHT,
};

static uint64_t const_value[] = {
  _uint64_set_pi16(3, 3, 3, 3),
  _uint64_set_pi16(7, 7, 7, 7),
  _uint64_set_pi16(8, 8, 8, 8),
};

#define PW_THREE get_const_value(index_PW_THREE)
#define PW_SEVEN get_const_value(index_PW_SEVEN)
#define PW_EIGHT get_const_value(index_PW_EIGHT)
/* Horizontally double one row of 8 intermediate sums.
 *
 * On entry, outptr##r holds 8 sixteen-bit intermediate values (3*this_row +
 * adjacent_row, stored by the caller), and wk[r] / wk[r+2] hold the column
 * carried in from the previous / next 8-column group.  The macro computes
 * (3*center + neighbor + 8) >> 4 for even outputs and
 * (3*center + neighbor + 7) >> 4 for odd outputs, interleaves them, and
 * overwrites outptr##r with 16 output bytes.  wk[r] is updated with the
 * last column for the next group. */
#define PROCESS_ROW(r) \
{ \
  mm7 = _mm_load_si64((__m64 *)outptr##r);    /* mm7=IntrL=( 0 1 2 3) */ \
  mm3 = _mm_load_si64((__m64 *)outptr##r+1);  /* mm3=IntrH=( 4 5 6 7) */ \
  \
  mm0 = mm7; \
  mm4 = mm3; \
  mm0 = _mm_srli_si64(mm0, 2*BYTE_BIT);                  /* mm0=( 1 2 3 -) */ \
  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD-2)*BYTE_BIT);  /* mm4=( - - - 4) */ \
  mm5 = mm7; \
  mm6 = mm3; \
  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD-2)*BYTE_BIT);  /* mm5=( 3 - - -) */ \
  mm6 = _mm_slli_si64(mm6, 2*BYTE_BIT);                  /* mm6=( - 4 5 6) */ \
  \
  mm0 = _mm_or_si64(mm0, mm4);  /* mm0=( 1 2 3 4) */ \
  mm5 = _mm_or_si64(mm5, mm6);  /* mm5=( 3 4 5 6) */ \
  \
  mm1 = mm7; \
  mm2 = mm3; \
  mm1 = _mm_slli_si64(mm1, 2*BYTE_BIT);  /* mm1=( - 0 1 2) */ \
  mm2 = _mm_srli_si64(mm2, 2*BYTE_BIT);  /* mm2=( 5 6 7 -) */ \
  mm4 = mm3; \
  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD-2)*BYTE_BIT);  /* mm4=( 7 - - -) */ \
  \
  mm1 = _mm_or_si64(mm1, wk[r]);    /* mm1=(-1 0 1 2) */ \
  mm2 = _mm_or_si64(mm2, wk[r+2]);  /* mm2=( 5 6 7 8) */ \
  \
  wk[r] = mm4; \
  \
  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
  \
  mm1 = _mm_add_pi16(mm1, mm7); \
  mm5 = _mm_add_pi16(mm5, mm3); \
  mm1 = _mm_srli_pi16(mm1, 4);  /* mm1=OutrLE=( 0 2 4 6) */ \
  mm5 = _mm_srli_pi16(mm5, 4);  /* mm5=OutrHE=( 8 10 12 14) */ \
  mm0 = _mm_add_pi16(mm0, mm7); \
  mm2 = _mm_add_pi16(mm2, mm3); \
  mm0 = _mm_srli_pi16(mm0, 4);  /* mm0=OutrLO=( 1 3 5 7) */ \
  mm2 = _mm_srli_pi16(mm2, 4);  /* mm2=OutrHO=( 9 11 13 15) */ \
  \
  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
  mm1 = _mm_or_si64(mm1, mm0);  /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \
  mm5 = _mm_or_si64(mm5, mm2);  /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \
  \
  _mm_store_si64((__m64 *)outptr##r, mm1); \
  _mm_store_si64((__m64 *)outptr##r+1, mm5); \
}
/*
 * Fancy (triangle-filter) 2x2 chroma upsampling.
 *
 * max_v_samp_factor  = number of output rows to produce per iteration pair
 * downsampled_width  = width of each input row in samples
 * input_data         = input rows; rows inrow-1 and inrow+1 are also read,
 *                      so the caller must provide valid border rows
 * output_data_ptr    = destination row array (each output row is twice as
 *                      wide as the input)
 *
 * Each input row pair produces two output rows.  The inner loop first stores
 * 16-bit intermediate sums (3*row[0] + row[-1] or row[+1]) into the output
 * rows, then PROCESS_ROW doubles them horizontally in place.  wk[0..3] carry
 * the boundary columns between 8-sample groups.
 */
void
jsimd_h2v2_fancy_upsample_mmi (int max_v_samp_factor,
                               JDIMENSION downsampled_width,
                               JSAMPARRAY input_data,
                               JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol, tmp, tmp1;
  /* NOTE(review): initializing __m64 variables with 0.0 relies on the
   * compiler accepting a double-to-__m64 conversion — confirm this is
   * intentional for the Loongson toolchain. */
  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
  __m64 wk[4], mm_tmp;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    if (downsampled_width & 7) {
      /* Width is not a multiple of 8: replicate the last sample of each of
       * the three input rows into the element just past the row end, so the
       * final 8-sample load reads well-defined data. */
      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
      tmp1 = downsampled_width * sizeof(JSAMPLE);
      asm("daddu $8, %3, %6\r\n"
          "lb $9, ($8)\r\n"
          "daddu $8, %3, %7\r\n"
          "sb $9, ($8)\r\n"
          "daddu $8, %4, %6\r\n"
          "lb $9, ($8)\r\n"
          "daddu $8, %4, %7\r\n"
          "sb $9, ($8)\r\n"
          "daddu $8, %5, %6\r\n"
          "lb $9, ($8)\r\n"
          "daddu $8, %5, %7\r\n"
          "sb $9, ($8)\r\n"
          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
          : "$8", "$9"
         );
    }

    /* process the first column block */
    mm0 = _mm_load_si64((__m64 *)inptr0);   /* mm0 = row[ 0][0] */
    mm1 = _mm_load_si64((__m64 *)inptr_1);  /* mm1 = row[-1][0] */
    mm2 = _mm_load_si64((__m64 *)inptr1);   /* mm2 = row[ 1][0] */

    mm3 = _mm_xor_si64(mm3, mm3);      /* mm3 = (all 0's) */
    mm4 = mm0;
    mm0 = _mm_unpacklo_pi8(mm0, mm3);  /* mm0 = row[ 0][0]( 0 1 2 3) */
    mm4 = _mm_unpackhi_pi8(mm4, mm3);  /* mm4 = row[ 0][0]( 4 5 6 7) */
    mm5 = mm1;
    mm1 = _mm_unpacklo_pi8(mm1, mm3);  /* mm1 = row[-1][0]( 0 1 2 3) */
    mm5 = _mm_unpackhi_pi8(mm5, mm3);  /* mm5 = row[-1][0]( 4 5 6 7) */
    mm6 = mm2;
    mm2 = _mm_unpacklo_pi8(mm2, mm3);  /* mm2 = row[+1][0]( 0 1 2 3) */
    mm6 = _mm_unpackhi_pi8(mm6, mm3);  /* mm6 = row[+1][0]( 4 5 6 7) */

    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
    mm4 = _mm_mullo_pi16(mm4, PW_THREE);

    /* mm7 = mask selecting only the first 16-bit column */
    mm7 = _mm_cmpeq_pi8(mm7, mm7);
    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD-2)*BYTE_BIT);

    mm1 = _mm_add_pi16(mm1, mm0);  /* mm1=Int0L=( 0 1 2 3) */
    mm5 = _mm_add_pi16(mm5, mm4);  /* mm5=Int0H=( 4 5 6 7) */
    mm2 = _mm_add_pi16(mm2, mm0);  /* mm2=Int1L=( 0 1 2 3) */
    mm6 = _mm_add_pi16(mm6, mm4);  /* mm6=Int1H=( 4 5 6 7) */

    _mm_store_si64((__m64*)outptr0, mm1);    /* temporarily save */
    _mm_store_si64((__m64*)outptr0+1, mm5);  /* the intermediate data */
    _mm_store_si64((__m64*)outptr1, mm2);
    _mm_store_si64((__m64*)outptr1+1, mm6);

    /* First group has no left neighbor: carry in its own first column */
    mm1 = _mm_and_si64(mm1, mm7);  /* mm1=( 0 - - -) */
    mm2 = _mm_and_si64(mm2, mm7);  /* mm2=( 0 - - -) */

    wk[0] = mm1;
    wk[1] = mm2;

    for (incol = downsampled_width; incol > 0;
         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
         outptr0 += 16, outptr1 += 16) {

      if (incol > 8) {
        /* process the next column block */
        mm0 = _mm_load_si64((__m64 *)inptr0+1);   /* mm0 = row[ 0][1] */
        mm1 = _mm_load_si64((__m64 *)inptr_1+1);  /* mm1 = row[-1][1] */
        mm2 = _mm_load_si64((__m64 *)inptr1+1);   /* mm2 = row[+1][1] */

        mm3 = _mm_setzero_si64();          /* mm3 = (all 0's) */
        mm4 = mm0;
        mm0 = _mm_unpacklo_pi8(mm0, mm3);  /* mm0 = row[ 0][1]( 0 1 2 3) */
        mm4 = _mm_unpackhi_pi8(mm4, mm3);  /* mm4 = row[ 0][1]( 4 5 6 7) */
        mm5 = mm1;
        mm1 = _mm_unpacklo_pi8(mm1, mm3);  /* mm1 = row[-1][1]( 0 1 2 3) */
        mm5 = _mm_unpackhi_pi8(mm5, mm3);  /* mm5 = row[-1][1]( 4 5 6 7) */
        mm6 = mm2;
        mm2 = _mm_unpacklo_pi8(mm2, mm3);  /* mm2 = row[+1][1]( 0 1 2 3) */
        mm6 = _mm_unpackhi_pi8(mm6, mm3);  /* mm6 = row[+1][1]( 4 5 6 7) */

        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
        mm4 = _mm_mullo_pi16(mm4, PW_THREE);

        mm1 = _mm_add_pi16(mm1, mm0);  /* mm1 = Int0L = ( 0 1 2 3) */
        mm5 = _mm_add_pi16(mm5, mm4);  /* mm5 = Int0H = ( 4 5 6 7) */
        mm2 = _mm_add_pi16(mm2, mm0);  /* mm2 = Int1L = ( 0 1 2 3) */
        mm6 = _mm_add_pi16(mm6, mm4);  /* mm6 = Int1H = ( 4 5 6 7) */

        _mm_store_si64((__m64*)outptr0+2, mm1);  /* temporarily save */
        _mm_store_si64((__m64*)outptr0+3, mm5);  /* the intermediate data */
        _mm_store_si64((__m64*)outptr1+2, mm2);
        _mm_store_si64((__m64*)outptr1+3, mm6);

        /* Carry the next group's first column into this group as its
         * right neighbor */
        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD-2)*BYTE_BIT);  /* mm1=( - - - 0) */
        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD-2)*BYTE_BIT);  /* mm2=( - - - 0) */

        wk[2] = mm1;
        wk[3] = mm2;
      } else {
        /* process the last column block: no right neighbor, so reuse the
         * group's own last column from the saved intermediates */
        mm1 = _mm_cmpeq_pi8(mm1, mm1);
        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD-2)*BYTE_BIT);
        mm2 = mm1;

        mm_tmp = _mm_load_si64((__m64 *)outptr0+1);
        mm1 = _mm_and_si64(mm1, mm_tmp);  /* mm1=( - - - 7) */
        mm_tmp = _mm_load_si64((__m64 *)outptr1+1);
        mm2 = _mm_and_si64(mm2, mm_tmp);  /* mm2=( - - - 7) */

        wk[2] = mm1;
        wk[3] = mm2;
      }

      /* process the upper row */
      PROCESS_ROW(0)

      /* process the lower row */
      PROCESS_ROW(1)
    }
  }
}

View File

@@ -0,0 +1,402 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2014, 2018, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* SLOW INTEGER FORWARD DCT */

#include "jsimd_mmi.h"


#define CONST_BITS 13
#define PASS1_BITS 2

/* Descale (right-shift) amounts for pass 1 (columns, keeps PASS1_BITS of
 * extra precision) and pass 2 (rows, removes it again) */
#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
#define DESCALE_P2 (CONST_BITS+PASS1_BITS)

/* AAN/LLM rotation constants scaled by 2^CONST_BITS (13 bits) */
#define FIX_0_298 ((short)2446)   /* FIX(0.298631336) */
#define FIX_0_390 ((short)3196)   /* FIX(0.390180644) */
#define FIX_0_541 ((short)4433)   /* FIX(0.541196100) */
#define FIX_0_765 ((short)6270)   /* FIX(0.765366865) */
#define FIX_0_899 ((short)7373)   /* FIX(0.899976223) */
#define FIX_1_175 ((short)9633)   /* FIX(1.175875602) */
#define FIX_1_501 ((short)12299)  /* FIX(1.501321110) */
#define FIX_1_847 ((short)15137)  /* FIX(1.847759065) */
#define FIX_1_961 ((short)16069)  /* FIX(1.961570560) */
#define FIX_2_053 ((short)16819)  /* FIX(2.053119869) */
#define FIX_2_562 ((short)20995)  /* FIX(2.562915447) */
#define FIX_3_072 ((short)25172)  /* FIX(3.072711026) */

enum const_index{
  index_PW_F130_F054,
  index_PW_F054_MF130,
  index_PW_MF078_F117,
  index_PW_F117_F078,
  index_PW_MF060_MF089,
  index_PW_MF089_F060,
  index_PW_MF050_MF256,
  index_PW_MF256_F050,
  index_PD_DESCALE_P1,
  index_PD_DESCALE_P2,
  index_PW_DESCALE_P2X
};

/* Interleaved coefficient pairs for _mm_madd_pi16; order must match
 * enum const_index above. */
static uint64_t const_value[]={
  _uint64_set_pi16(FIX_0_541, (FIX_0_541+FIX_0_765),
                   FIX_0_541, (FIX_0_541+FIX_0_765)),
  _uint64_set_pi16((FIX_0_541-FIX_1_847), FIX_0_541,
                   (FIX_0_541-FIX_1_847), FIX_0_541),
  _uint64_set_pi16(FIX_1_175, (FIX_1_175-FIX_1_961),
                   FIX_1_175, (FIX_1_175-FIX_1_961)),
  _uint64_set_pi16((FIX_1_175-FIX_0_390), FIX_1_175,
                   (FIX_1_175-FIX_0_390), FIX_1_175),
  _uint64_set_pi16(-FIX_0_899, (FIX_0_298-FIX_0_899),
                   -FIX_0_899, (FIX_0_298-FIX_0_899)),
  _uint64_set_pi16((FIX_1_501-FIX_0_899), -FIX_0_899,
                   (FIX_1_501-FIX_0_899), -FIX_0_899),
  _uint64_set_pi16(-FIX_2_562, (FIX_2_053-FIX_2_562),
                   -FIX_2_562, (FIX_2_053-FIX_2_562)),
  _uint64_set_pi16((FIX_3_072-FIX_2_562), -FIX_2_562,
                   (FIX_3_072-FIX_2_562), -FIX_2_562),
  _uint64_set_pi32((1 << (DESCALE_P1-1)), (1 << (DESCALE_P1-1))),
  _uint64_set_pi32((1 << (DESCALE_P2-1)), (1 << (DESCALE_P2-1))),
  _uint64_set_pi16((1 << (PASS1_BITS-1)), (1 << (PASS1_BITS-1)),
                   (1 << (PASS1_BITS-1)), (1 << (PASS1_BITS-1)))
};

#define PW_F130_F054 get_const_value(index_PW_F130_F054)
#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
#define PW_F117_F078 get_const_value(index_PW_F117_F078)
#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X)
/* Arithmetic shared by both FDCT passes.
 *
 * Expects tmp12, tmp13 (even part) and tmp4..tmp7 (odd part) in scope and
 * produces out1, out2, out3, out5, out6, out7.  PASS (1 or 2) selects the
 * descale rounding constant and shift (DESCALE_P1/DESCALE_P2).  Products
 * are accumulated in 32-bit precision via _mm_madd_pi16 with the paired
 * constants defined above, then rounded, shifted, and packed back to
 * 16-bit words. */
#define DO_FDCT_COMMON(PASS) \
{ \
  __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
  __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
  __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
  __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
  \
  /* (Original) \
   * z1 = (tmp12 + tmp13) * 0.541196100; \
   * out2 = z1 + tmp13 * 0.765366865; \
   * out6 = z1 + tmp12 * -1.847759065; \
   * \
   * (This implementation) \
   * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
   * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
   */ \
  \
  tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
  tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
  \
  out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
  out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
  out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
  out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
  \
  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
  \
  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
  \
  out2 = _mm_packs_pi32(out2l, out2h); \
  out6 = _mm_packs_pi32(out6l, out6h); \
  \
  /* Odd part */ \
  \
  z3 = _mm_add_pi16(tmp4, tmp6); \
  z4 = _mm_add_pi16(tmp5, tmp7); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
   * z3 += z5; z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = _mm_unpacklo_pi16(z3, z4); \
  z34h = _mm_unpackhi_pi16(z3, z4); \
  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
  \
  /* (Original) \
   * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
   * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
   * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
   * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
   * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \
   * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \
   * \
   * (This implementation) \
   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
   * out7 = tmp4 + z3; out5 = tmp5 + z4; \
   * out3 = tmp6 + z3; out1 = tmp7 + z4; \
   */ \
  \
  tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
  tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
  \
  tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
  tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
  tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
  tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
  \
  out7l = _mm_add_pi32(tmp4l, z3l); \
  out7h = _mm_add_pi32(tmp4h, z3h); \
  out1l = _mm_add_pi32(tmp7l, z4l); \
  out1h = _mm_add_pi32(tmp7h, z4h); \
  \
  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
  \
  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
  \
  out7 = _mm_packs_pi32(out7l, out7h); \
  out1 = _mm_packs_pi32(out1l, out1h); \
  \
  tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
  tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
  \
  tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
  tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
  tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
  tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
  \
  out5l = _mm_add_pi32(tmp5l, z4l); \
  out5h = _mm_add_pi32(tmp5h, z4h); \
  out3l = _mm_add_pi32(tmp6l, z3l); \
  out3h = _mm_add_pi32(tmp6h, z3h); \
  \
  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
  \
  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
  \
  out5 = _mm_packs_pi32(out5l, out5h); \
  out3 = _mm_packs_pi32(out3l, out3h); \
}
/* DO_FDCT_PASS1(): first pass (row pass) of the slow-integer forward DCT.
 *
 * Consumes a 4-row x 8-column tile at dataptr: loads four rows of 16-bit
 * coefficients, transposes them into eight 4-element column vectors, computes
 * the even half of the 1-D DCT inline (out0/out4, left-shifted by PASS1_BITS
 * to keep fractional precision for pass 2), and delegates the remaining
 * outputs to DO_FDCT_COMMON(1).  Results are stored back over the same tile
 * in transposed order, which is why two invocations (rows 0-3 and 4-7) plus
 * a column pass complete the 2-D transform.
 *
 * Relies on tmp0-tmp7, tmp12, tmp13, out0-out7, and dataptr being declared
 * in the invoking scope (see jsimd_fdct_islow_mmi). */
#define DO_FDCT_PASS1() \
{ \
__m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
__m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
__m64 col0, col1, col2, col3, col4, col5, col6, col7; \
__m64 tmp10, tmp11; \
\
row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*0]); /* (00 01 02 03) */ \
row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*0+4]); /* (04 05 06 07) */ \
row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*1]); /* (10 11 12 13) */ \
row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*1+4]); /* (14 15 16 17) */ \
row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*2]); /* (20 21 22 23) */ \
row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*2+4]); /* (24 25 26 27) */ \
row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*3]); /* (30 31 32 33) */ \
row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*3+4]); /* (34 35 36 37) */ \
\
/* Transpose coefficients */ \
\
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
\
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
\
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
\
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
\
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
\
tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
\
/* Even part */ \
\
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
\
out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
out0 = _mm_slli_pi16(out0, PASS1_BITS); /* keep extra fractional bits */ \
out4 = _mm_slli_pi16(out4, PASS1_BITS); \
\
DO_FDCT_COMMON(1) \
\
_mm_store_si64((__m64*)&dataptr[DCTSIZE*0], out0); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*0+4], out4); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*1], out1); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*1+4], out5); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*2], out2); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*2+4], out6); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*3], out3); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*3+4], out7); \
}
/* DO_FDCT_PASS2(): second pass (column pass) of the slow-integer forward
 * DCT.
 *
 * Consumes a 8-row x 4-column tile at dataptr (already transposed by pass
 * 1), transposes it back into row vectors, computes the even half of the
 * 1-D DCT inline, and delegates the rest to DO_FDCT_COMMON(2).  Unlike
 * pass 1, the even outputs are descaled here: rounded via PW_DESCALE_P2X
 * and shifted right by PASS1_BITS to remove the extra precision added in
 * pass 1.  Results are stored in natural (row) order.
 *
 * Relies on tmp0-tmp7, tmp12, tmp13, out0-out7, and dataptr being declared
 * in the invoking scope (see jsimd_fdct_islow_mmi). */
#define DO_FDCT_PASS2() \
{ \
__m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
__m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
__m64 row0, row1, row2, row3, row4, row5, row6, row7; \
__m64 tmp10, tmp11; \
\
col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*0]); /* (00 10 20 30) */ \
col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*1]); /* (01 11 21 31) */ \
col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*2]); /* (02 12 22 32) */ \
col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*3]); /* (03 13 23 33) */ \
col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*4]); /* (40 50 60 70) */ \
col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*5]); /* (41 51 61 71) */ \
col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*6]); /* (42 52 62 72) */ \
col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE*7]); /* (43 53 63 73) */ \
\
/* Transpose coefficients */ \
\
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
\
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
\
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
\
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
\
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
\
tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
\
/* Even part */ \
\
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
\
out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
\
out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); /* round, then undo the */ \
out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); /* PASS1_BITS scaling */ \
out0 = _mm_srai_pi16(out0, PASS1_BITS); \
out4 = _mm_srai_pi16(out4, PASS1_BITS); \
\
DO_FDCT_COMMON(2) \
\
_mm_store_si64((__m64*)&dataptr[DCTSIZE*0], out0); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*1], out1); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*2], out2); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*3], out3); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*4], out4); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*5], out5); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*6], out6); \
_mm_store_si64((__m64*)&dataptr[DCTSIZE*7], out7); \
}
/*
 * Perform the forward DCT on one block of samples, using the accurate
 * ("slow") integer method.
 *
 * data points to an 8x8 block of DCTELEMs, transformed in place.  The
 * DO_FDCT_PASS* macros capture tmp0-tmp7, tmp12, tmp13, out0-out7, and
 * dataptr from this scope by name, so these declarations (and the pointer
 * bumping between invocations) are part of the macros' contract.
 */
void
jsimd_fdct_islow_mmi (DCTELEM *data)
{
__m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m64 out0, out1, out2, out3, out4, out5, out6, out7;
__m64 tmp12, tmp13;
DCTELEM *dataptr = data;
/* Pass 1: process rows. */
DO_FDCT_PASS1()
/* Advance to the second 4-row tile (rows 4-7). */
dataptr += DCTSIZE*4;
DO_FDCT_PASS1()
/* Pass 2: process columns. */
dataptr = data;
DO_FDCT_PASS2()
/* Advance to the second 4-column tile (columns 4-7). */
dataptr += 4;
DO_FDCT_PASS2()
}

View File

@@ -0,0 +1,575 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, 2018, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* SLOW INTEGER INVERSE DCT */
#include "jsimd_mmi.h"
/* Fixed-point parameters for the slow-integer IDCT.  All multipliers are
 * scaled by 2^CONST_BITS; FIX_x_y below is round(x.y... * 2^13).
 * DESCALE_P1/DESCALE_P2 are the right-shift amounts that remove the
 * scaling after pass 1 and pass 2, respectively (pass 2 also removes the
 * 3 extra bits inherent in the 8x8 transform). */
#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
#define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
/* Offset added to recenter samples from signed to unsigned range. */
#define CENTERJSAMPLE 128
#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
/* Indices of the 64-bit constants in const_value[] below; the PW_, PD_,
 * and PB_ accessor macros use these to select an entry.  The order here
 * must match the initializer order of const_value[]. */
enum const_index {
index_PW_F130_F054,
index_PW_F054_MF130,
index_PW_MF078_F117,
index_PW_F117_F078,
index_PW_MF060_MF089,
index_PW_MF089_F060,
index_PW_MF050_MF256,
index_PW_MF256_F050,
index_PD_DESCALE_P1,
index_PD_DESCALE_P2,
index_PB_CENTERJSAMP
};
/* Packed-constant table: pairs of 16-bit multipliers interleaved for
 * _mm_madd_pi16, 32-bit rounding biases, and the byte-replicated sample
 * recentering constant. */
static uint64_t const_value[] = {
_uint64_set_pi16(FIX_0_541, (FIX_0_541+FIX_0_765),
FIX_0_541, (FIX_0_541+FIX_0_765)), /* PW_F130_F054 */
_uint64_set_pi16((FIX_0_541-FIX_1_847), FIX_0_541,
(FIX_0_541-FIX_1_847), FIX_0_541), /* PW_F054_MF130 */
_uint64_set_pi16(FIX_1_175, (FIX_1_175-FIX_1_961),
FIX_1_175, (FIX_1_175-FIX_1_961)), /* PW_MF078_F117 */
_uint64_set_pi16((FIX_1_175-FIX_0_390), FIX_1_175,
(FIX_1_175-FIX_0_390), FIX_1_175), /* PW_F117_F078 */
_uint64_set_pi16(-FIX_0_899, (FIX_0_298-FIX_0_899),
-FIX_0_899, (FIX_0_298-FIX_0_899)), /* PW_MF060_MF089 */
_uint64_set_pi16((FIX_1_501-FIX_0_899), -FIX_0_899,
(FIX_1_501-FIX_0_899), -FIX_0_899), /* PW_MF089_F060 */
_uint64_set_pi16(-FIX_2_562, (FIX_2_053-FIX_2_562),
-FIX_2_562, (FIX_2_053-FIX_2_562)), /* PW_MF050_MF256 */
_uint64_set_pi16((FIX_3_072-FIX_2_562), -FIX_2_562,
(FIX_3_072-FIX_2_562), -FIX_2_562), /* PW_MF256_F050 */
_uint64_set_pi32((1 << (DESCALE_P1-1)), (1 << (DESCALE_P1-1))), /* rounding bias, pass 1 */
_uint64_set_pi32((1 << (DESCALE_P2-1)), (1 << (DESCALE_P2-1))), /* rounding bias, pass 2 */
_uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE) /* PB_CENTERJSAMP */
};
/* Named accessors for const_value[].  get_const_value(i) is expected to
 * yield const_value[i] as an __m64 (NOTE(review): presumably defined in
 * jsimd_mmi.h -- not visible here, confirm). */
#define PW_F130_F054 get_const_value(index_PW_F130_F054)
#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
#define PW_F117_F078 get_const_value(index_PW_F117_F078)
#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
/* Zero tests that read the vector value back through an integer pointer;
 * used for the all-zero-AC shortcut in DO_IDCT_PASS1. */
#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
/* DO_IDCT_COMMON(PASS): odd part and final butterfly of the 1-D slow
 * integer IDCT, shared between the column pass (PASS=1) and the row pass
 * (PASS=2).
 *
 * Inputs from the invoking scope: tmp0-tmp3 (odd-numbered input
 * coefficients, see the algebra in the comments below) and the 32-bit
 * even-part sums tmp10l/h..tmp13l/h.  Outputs: out0-out7, each rounded
 * with PD_DESCALE_P<PASS> and arithmetically shifted by DESCALE_P<PASS>
 * before being re-packed to 16 bits with signed saturation. */
#define DO_IDCT_COMMON(PASS) \
{ \
__m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
__m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
__m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
__m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
__m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
\
z3 = _mm_add_pi16(tmp0, tmp2); \
z4 = _mm_add_pi16(tmp1, tmp3); \
\
/* (Original) \
 * z5 = (z3 + z4) * 1.175875602; \
 * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
 * z3 += z5; z4 += z5; \
 * \
 * (This implementation) \
 * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
 * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
 */ \
\
z34l = _mm_unpacklo_pi16(z3, z4); \
z34h = _mm_unpackhi_pi16(z3, z4); \
z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
\
/* (Original) \
 * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
 * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
 * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
 * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
 * tmp0 += z1 + z3; tmp1 += z2 + z4; \
 * tmp2 += z2 + z3; tmp3 += z1 + z4; \
 * \
 * (This implementation) \
 * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
 * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
 * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
 * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
 * tmp0 += z3; tmp1 += z4; \
 * tmp2 += z3; tmp3 += z4; \
 */ \
\
tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
\
tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
\
tmp0l = _mm_add_pi32(tmp0l, z3l); \
tmp0h = _mm_add_pi32(tmp0h, z3h); \
tmp3l = _mm_add_pi32(tmp3l, z4l); \
tmp3h = _mm_add_pi32(tmp3h, z4h); \
\
tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
\
tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
\
tmp1l = _mm_add_pi32(tmp1l, z4l); \
tmp1h = _mm_add_pi32(tmp1h, z4h); \
tmp2l = _mm_add_pi32(tmp2l, z3l); \
tmp2h = _mm_add_pi32(tmp2h, z3h); \
\
/* Final output stage */ \
\
out0l = _mm_add_pi32(tmp10l, tmp3l); \
out0h = _mm_add_pi32(tmp10h, tmp3h); \
out7l = _mm_sub_pi32(tmp10l, tmp3l); \
out7h = _mm_sub_pi32(tmp10h, tmp3h); \
\
out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
\
out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
\
out0 = _mm_packs_pi32(out0l, out0h); \
out7 = _mm_packs_pi32(out7l, out7h); \
\
out1l = _mm_add_pi32(tmp11l, tmp2l); \
out1h = _mm_add_pi32(tmp11h, tmp2h); \
out6l = _mm_sub_pi32(tmp11l, tmp2l); \
out6h = _mm_sub_pi32(tmp11h, tmp2h); \
\
out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
\
out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
\
out1 = _mm_packs_pi32(out1l, out1h); \
out6 = _mm_packs_pi32(out6l, out6h); \
\
out2l = _mm_add_pi32(tmp12l, tmp1l); \
out2h = _mm_add_pi32(tmp12h, tmp1h); \
out5l = _mm_sub_pi32(tmp12l, tmp1l); \
out5h = _mm_sub_pi32(tmp12h, tmp1h); \
\
out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
\
out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
\
out2 = _mm_packs_pi32(out2l, out2h); \
out5 = _mm_packs_pi32(out5l, out5h); \
\
out3l = _mm_add_pi32(tmp13l, tmp0l); \
out3h = _mm_add_pi32(tmp13h, tmp0h); \
\
out4l = _mm_sub_pi32(tmp13l, tmp0l); \
out4h = _mm_sub_pi32(tmp13h, tmp0h); \
\
out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
\
out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
\
out3 = _mm_packs_pi32(out3l, out3h); \
out4 = _mm_packs_pi32(out4l, out4h); \
}
/* DO_IDCT_PASS1(iter): column pass of the slow-integer IDCT over a
 * 4-column strip.
 *
 * Dequantizes the coefficients (element-wise multiply by quantptr),
 * computes the even part inline, hands the odd part to DO_IDCT_COMMON(1),
 * then transposes the 4x8 result into wsptr.  A fast path detects the
 * common all-zero-AC case: if rows 1 and 2 are zero, all eight rows are
 * checked, and when everything but the DC term is zero the output rows
 * are simply the replicated, rescaled DC values.  The fast path exits via
 * "goto nextcolumn##iter", so the caller must place a matching
 * nextcolumn<iter> label immediately after each invocation.
 *
 * Relies on tmp0-tmp3, out0-out7, inptr, quantptr, and wsptr being
 * declared in the invoking scope (see jsimd_idct_islow_mmi). */
#define DO_IDCT_PASS1(iter) \
{ \
__m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
__m64 quant0l, quant1l, quant2l, quant3l; \
__m64 quant4l, quant5l, quant6l, quant7l; \
__m64 z23, z2, z3, z23l, z23h; \
__m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
__m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
__m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
__m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
__m32 col0a, col1a, mm0; \
\
col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE*1]); \
col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE*2]); \
mm0 = _mm_or_si32(col0a, col1a); \
\
if (test_m32_zero(mm0)) { \
__m64 mm1, mm2; \
\
col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*0]); \
col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*1]); \
col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*2]); \
col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*3]); \
col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*4]); \
col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*5]); \
col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*6]); \
col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*7]); \
\
mm1 = _mm_or_si64(col1l, col3l); \
mm2 = _mm_or_si64(col2l, col4l); \
mm1 = _mm_or_si64(mm1, col5l); \
mm2 = _mm_or_si64(mm2, col6l); \
mm1 = _mm_or_si64(mm1, col7l); \
mm1 = _mm_or_si64(mm1, mm2); \
\
if (test_m64_zero(mm1)) { \
__m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
\
/* AC terms all zero */ \
\
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*0]); \
\
dcval = _mm_mullo_pi16(col0l, quant0l); \
dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
\
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
\
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
\
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*0], row0); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*0+4], row0); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*1], row1); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*1+4], row1); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*2], row2); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*2+4], row2); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*3], row3); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE*3+4], row3); \
\
goto nextcolumn##iter; \
} \
} \
\
/* Even part \
 * \
 * (Original) \
 * z1 = (z2 + z3) * 0.541196100; \
 * tmp2 = z1 + z3 * -1.847759065; \
 * tmp3 = z1 + z2 * 0.765366865; \
 * \
 * (This implementation) \
 * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
 * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
 */ \
\
col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*0]); /* (00 10 20 30) */ \
col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*2]); /* (02 12 22 32) */ \
col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*4]); /* (04 14 24 34) */ \
col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*6]); /* (06 16 26 36) */ \
\
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*0]); \
quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*2]); \
quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*4]); \
quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*6]); \
\
z2 = _mm_mullo_pi16(col2l, quant2l); \
z3 = _mm_mullo_pi16(col6l, quant6l); \
\
z23l = _mm_unpacklo_pi16(z2, z3); \
z23h = _mm_unpackhi_pi16(z2, z3); \
tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
\
z2 = _mm_mullo_pi16(col0l, quant0l); \
z3 = _mm_mullo_pi16(col4l, quant4l); \
\
z23 = _mm_add_pi16(z2, z3); \
tmp0l = _mm_loadlo_pi16_f(z23); \
tmp0h = _mm_loadhi_pi16_f(z23); \
tmp0l = _mm_srai_pi32(tmp0l, (16-CONST_BITS)); \
tmp0h = _mm_srai_pi32(tmp0h, (16-CONST_BITS)); \
\
tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
\
z23 = _mm_sub_pi16(z2, z3); \
tmp1l = _mm_loadlo_pi16_f(z23); \
tmp1h = _mm_loadhi_pi16_f(z23); \
tmp1l = _mm_srai_pi32(tmp1l, (16-CONST_BITS)); \
tmp1h = _mm_srai_pi32(tmp1h, (16-CONST_BITS)); \
\
tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
\
/* Odd part */ \
\
col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*1]); /* (01 11 21 31) */ \
col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*3]); /* (03 13 23 33) */ \
col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*5]); /* (05 15 25 35) */ \
col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE*7]); /* (07 17 27 37) */ \
\
quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*1]); \
quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*3]); \
quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*5]); \
quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE*7]); \
\
tmp0 = _mm_mullo_pi16(col7l, quant7l); \
tmp1 = _mm_mullo_pi16(col5l, quant5l); \
tmp2 = _mm_mullo_pi16(col3l, quant3l); \
tmp3 = _mm_mullo_pi16(col1l, quant1l); \
\
DO_IDCT_COMMON(1) \
\
/* out0=(00 10 20 30), out1=(01 11 21 31) */ \
/* out2=(02 12 22 32), out3=(03 13 23 33) */ \
/* out4=(04 14 24 34), out5=(05 15 25 35) */ \
/* out6=(06 16 26 36), out7=(07 17 27 37) */ \
\
/* Transpose coefficients */ \
\
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
\
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
\
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
\
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
\
_mm_store_si64((__m64*)&wsptr[DCTSIZE*0], row0l); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*0+4], row0h); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*1], row1l); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*1+4], row1h); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*2], row2l); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*2+4], row2h); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*3], row3l); \
_mm_store_si64((__m64*)&wsptr[DCTSIZE*3+4], row3h); \
}
/* DO_IDCT_PASS2(ctr): row pass of the slow-integer IDCT over four output
 * rows starting at output_buf[ctr].
 *
 * Reads eight 4-element rows from wsptr (produced by DO_IDCT_PASS1),
 * computes the even part inline and the odd part via DO_IDCT_COMMON(2),
 * packs the results to bytes with signed saturation, recenters them by
 * adding PB_CENTERJSAMP, transposes, and stores four 8-sample rows into
 * the output buffer at output_col.
 *
 * Relies on tmp0-tmp3, out0-out7, wsptr, output_buf, and output_col being
 * declared in the invoking scope (see jsimd_idct_islow_mmi). */
#define DO_IDCT_PASS2(ctr) \
{ \
__m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
__m64 z23, z23l, z23h; \
__m64 col0123a, col0123b, col0123c, col0123d; \
__m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
__m64 col0, col1, col2, col3; \
__m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
__m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
\
row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*0]); /* (00 01 02 03) */ \
row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*1]); /* (10 11 12 13) */ \
row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*2]); /* (20 21 22 23) */ \
row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*3]); /* (30 31 32 33) */ \
row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*4]); /* (40 41 42 43) */ \
row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*5]); /* (50 51 52 53) */ \
row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*6]); /* (60 61 62 63) */ \
row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE*7]); /* (70 71 72 73) */ \
\
/* Even part \
 * \
 * (Original) \
 * z1 = (z2 + z3) * 0.541196100; \
 * tmp2 = z1 + z3 * -1.847759065; \
 * tmp3 = z1 + z2 * 0.765366865; \
 * \
 * (This implementation) \
 * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
 * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
 */ \
\
z23l = _mm_unpacklo_pi16(row2l, row6l); \
z23h = _mm_unpackhi_pi16(row2l, row6l); \
\
tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
\
z23 = _mm_add_pi16(row0l, row4l); \
tmp0l = _mm_loadlo_pi16_f(z23); \
tmp0h = _mm_loadhi_pi16_f(z23); \
tmp0l = _mm_srai_pi32(tmp0l, (16-CONST_BITS)); \
tmp0h = _mm_srai_pi32(tmp0h, (16-CONST_BITS)); \
\
tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
\
z23 = _mm_sub_pi16(row0l, row4l); \
tmp1l = _mm_loadlo_pi16_f(z23); \
tmp1h = _mm_loadhi_pi16_f(z23); \
tmp1l = _mm_srai_pi32(tmp1l, (16-CONST_BITS)); \
tmp1h = _mm_srai_pi32(tmp1h, (16-CONST_BITS)); \
\
tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
\
/* Odd part */ \
\
tmp0 = row7l; \
tmp1 = row5l; \
tmp2 = row3l; \
tmp3 = row1l; \
\
DO_IDCT_COMMON(2) \
\
/* out0=(00 01 02 03), out1=(10 11 12 13) */ \
/* out2=(20 21 22 23), out3=(30 31 32 33) */ \
/* out4=(40 41 42 43), out5=(50 51 52 53) */ \
/* out6=(60 61 62 63), out7=(70 71 72 73) */ \
\
row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
\
row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); /* recenter to 0..255 */ \
row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
\
/* Transpose coefficients */ \
\
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
\
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
\
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
\
_mm_store_si64((__m64*)(output_buf[ctr+0] + output_col), col0); \
_mm_store_si64((__m64*)(output_buf[ctr+1] + output_col), col1); \
_mm_store_si64((__m64*)(output_buf[ctr+2] + output_col), col2); \
_mm_store_si64((__m64*)(output_buf[ctr+3] + output_col), col3); \
}
/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * using the accurate ("slow") integer method.
 *
 * dct_table is the ISLOW_MULT_TYPE dequantization table, coef_block the
 * 8x8 quantized coefficients; the reconstructed samples are written to
 * output_buf rows at column offset output_col.  The DO_IDCT_PASS* macros
 * capture tmp0-tmp3, out0-out7, inptr, quantptr, and wsptr from this
 * scope by name.  The nextcolumn<iter> labels are the jump targets of the
 * all-zero-AC shortcut inside DO_IDCT_PASS1(iter) and must directly
 * follow the matching invocation.
 */
void
jsimd_idct_islow_mmi (void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
__m64 tmp0, tmp1, tmp2, tmp3;
__m64 out0, out1, out2, out3, out4, out5, out6, out7;
JCOEFPTR inptr;
ISLOW_MULT_TYPE *quantptr;
JCOEF *wsptr;
JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
/* Pass 1: process columns. */
inptr = coef_block;
quantptr = (ISLOW_MULT_TYPE *)dct_table;
wsptr = workspace;
DO_IDCT_PASS1(1)
nextcolumn1:
/* Advance to columns 4-7. */
inptr += 4;
quantptr += 4;
wsptr += DCTSIZE*4;
DO_IDCT_PASS1(2)
nextcolumn2:
/* Pass 2: process rows. */
wsptr = workspace;
DO_IDCT_PASS2(0)
/* Advance to rows 4-7 of the workspace. */
wsptr += 4;
DO_IDCT_PASS2(4)
}

132
simd/loongson/jquanti-mmi.c Normal file
View File

@@ -0,0 +1,132 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
* Copyright (C) 2018, D. R. Commander. All Rights Reserved.
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
#include "jsimd_mmi.h"
/* DO_QUANT(): quantize one row of 8 DCT coefficients.
 *
 * Implements division-by-multiplication: take the absolute value of each
 * coefficient, add the correction/rounding term (divisors plane 1),
 * multiply by the reciprocal (plane 0) and the scale (plane 2) with
 * 16-bit high-half multiplies -- compensating for the signed pmulhw
 * semantics as described inline -- and finally restore the original sign.
 * Advances workspace, divisors, and output_ptr by DCTSIZE at the end, so
 * consecutive invocations walk the whole 8x8 block.
 *
 * Relies on mm0-mm7, corr0/1, recip0/1, scale0/1, workspace, divisors,
 * and output_ptr being declared in the invoking scope
 * (see jsimd_quantize_mmi). */
#define DO_QUANT() \
{ \
\
mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
\
mm0 = mm2; \
mm1 = mm3; \
\
mm2 = _mm_srai_pi16(mm2, (WORD_BIT-1)); /* -1 if value < 0, */ \
/* 0 otherwise */ \
mm3 = _mm_srai_pi16(mm3, (WORD_BIT-1)); \
\
mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \
mm1 = _mm_xor_si64(mm1, mm3); \
mm0 = _mm_sub_pi16(mm0, mm2); \
mm1 = _mm_sub_pi16(mm1, mm3); \
\
corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2*1]); /* correction */ \
corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2*1 + 4]); \
\
mm0 = _mm_add_pi16(mm0, corr0); /* correction + roundfactor */ \
mm1 = _mm_add_pi16(mm1, corr1); \
\
mm4 = mm0; \
mm5 = mm1; \
\
recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2*0]); /* reciprocal */ \
recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2*0 + 4]); \
\
mm0 = _mm_mulhi_pi16(mm0, recip0); \
mm1 = _mm_mulhi_pi16(mm1, recip1); \
\
mm0 = _mm_add_pi16(mm0, mm4); /* reciprocal is always negative */ \
mm1 = _mm_add_pi16(mm1, mm5); /* (MSB=1), so we always need to add the */ \
/* initial value (input value is never */ \
/* negative as we inverted it at the */ \
/* start of this routine) */ \
\
scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2*2]); /* scale */ \
scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2*2 + 4]); \
\
mm6 = scale0; \
mm7 = scale1; \
mm4 = mm0; \
mm5 = mm1; \
\
mm0 = _mm_mulhi_pi16(mm0, mm6); \
mm1 = _mm_mulhi_pi16(mm1, mm7); \
\
mm6 = _mm_srai_pi16(mm6, (WORD_BIT-1)); /* determine if scale... */ \
/* is negative */ \
mm7 = _mm_srai_pi16(mm7, (WORD_BIT-1)); \
\
mm6 = _mm_and_si64(mm6, mm4); /* and add input if it is */ \
mm7 = _mm_and_si64(mm7, mm5); \
mm0 = _mm_add_pi16(mm0, mm6); \
mm1 = _mm_add_pi16(mm1, mm7); \
\
mm4 = _mm_srai_pi16(mm4, (WORD_BIT-1)); /* then check if... */ \
mm5 = _mm_srai_pi16(mm5, (WORD_BIT-1)); /* negative input */ \
\
mm4 = _mm_and_si64(mm4, scale0); /* and add scale if it is */ \
mm5 = _mm_and_si64(mm5, scale1); \
mm0 = _mm_add_pi16(mm0, mm4); \
mm1 = _mm_add_pi16(mm1, mm5); \
\
mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \
mm1 = _mm_xor_si64(mm1, mm3); \
mm0 = _mm_sub_pi16(mm0, mm2); \
mm1 = _mm_sub_pi16(mm1, mm3); \
\
_mm_store_si64((__m64 *)&output_ptr[0], mm0); \
_mm_store_si64((__m64 *)&output_ptr[4], mm1); \
\
workspace += DCTSIZE; \
divisors += DCTSIZE; \
output_ptr += DCTSIZE; \
}
/*
 * Quantize/descale the coefficients and output the whole 8x8 block.
 *
 * workspace holds the DCTSIZE2 coefficients produced by the forward DCT;
 * divisors points at the combined reciprocal/correction/scale tables
 * (three DCTSIZE2-element planes, as read by DO_QUANT); the quantized
 * result is written to coef_block.  DO_QUANT() captures mm0-mm7, corr0/1,
 * recip0/1, scale0/1, workspace, divisors, and output_ptr from this scope
 * and advances the three pointers itself, one row of 8 coefficients per
 * invocation.
 */
void
jsimd_quantize_mmi (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
  JCOEFPTR output_ptr = coef_block;
  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
  __m64 corr0, corr1, recip0, recip1, scale0, scale1;
  int row;

  /* DCTSIZE iterations of one row each cover the full block; this is
   * equivalent to unrolling DO_QUANT() eight times. */
  for (row = 0; row < DCTSIZE; row++) {
    DO_QUANT()
  }
}

592
simd/loongson/jsimd.c Normal file
View File

@@ -0,0 +1,592 @@
/*
* jsimd_loongson.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* For conditions of distribution and use, see copyright notice in jsimdext.inc
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on a
* Loongson architecture.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
/* Bitmask of JSIMD_* capability flags.  ~0 is a sentinel meaning "not yet
 * initialized"; init_simd() replaces it with the real capability set. */
static unsigned int simd_support = ~0;

/*
 * Check what SIMD accelerations are supported.
 *
 * FIXME: This code is racy under a multi-threaded environment.
 */
LOCAL(void)
init_simd (void)
{
  char *env = NULL;

  if (simd_support != ~0U)
    return;

  /* Assign (do not OR into the ~0 sentinel): OR-ing JSIMD_MMI into ~0 would
   * leave simd_support == ~0U, so the early-return guard above would never
   * fire and every capability bit would spuriously appear to be set. */
  simd_support = JSIMD_MMI;

  /* Force different settings through environment variables */
  env = getenv("JSIMD_FORCENONE");
  if ((env != NULL) && (strcmp(env, "1") == 0))
    simd_support = 0;
}
GLOBAL(int)
jsimd_can_rgb_ycc (void)
{
  init_simd();

  /* The MMI color conversion code supports the default build
   * configuration only. */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
    return 0;

  return (simd_support & JSIMD_MMI) ? 1 : 0;
}
GLOBAL(int)
jsimd_can_rgb_gray (void)
{
  /* No MMI implementation of RGB-to-grayscale conversion; report it as
   * unsupported so the library uses the C implementation instead. */
  return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
  init_simd();

  /* The MMI color deconversion code supports the default build
   * configuration only. */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
    return 0;

  return (simd_support & JSIMD_MMI) ? 1 : 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb565 (void)
{
  /* No MMI implementation of YCbCr-to-RGB565 conversion. */
  return 0;
}
GLOBAL(int)
jsimd_c_can_null_convert (void)
{
  /* No MMI implementation of the null (pass-through) color conversion. */
  return 0;
}
GLOBAL(void)
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                       JDIMENSION output_row, int num_rows)
{
  /* Select the MMI kernel that matches the layout of the input pixels.
   * The plain RGB kernel handles every other (non-extended) colorspace. */
  void (*mmifct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
    jsimd_rgb_ycc_convert_mmi;

  switch (cinfo->in_color_space) {
  case JCS_EXT_RGB:
    mmifct = jsimd_extrgb_ycc_convert_mmi;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    mmifct = jsimd_extrgbx_ycc_convert_mmi;
    break;
  case JCS_EXT_BGR:
    mmifct = jsimd_extbgr_ycc_convert_mmi;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    mmifct = jsimd_extbgrx_ycc_convert_mmi;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    mmifct = jsimd_extxbgr_ycc_convert_mmi;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    mmifct = jsimd_extxrgb_ycc_convert_mmi;
    break;
  default:
    break;
  }

  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                        JDIMENSION output_row, int num_rows)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_rgb_gray() returns 0. */
}
GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
                       JSAMPIMAGE input_buf, JDIMENSION input_row,
                       JSAMPARRAY output_buf, int num_rows)
{
  /* Select the MMI kernel that matches the layout of the output pixels.
   * The plain RGB kernel handles every other (non-extended) colorspace. */
  void (*mmifct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
    jsimd_ycc_rgb_convert_mmi;

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:
    mmifct = jsimd_ycc_extrgb_convert_mmi;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    mmifct = jsimd_ycc_extrgbx_convert_mmi;
    break;
  case JCS_EXT_BGR:
    mmifct = jsimd_ycc_extbgr_convert_mmi;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    mmifct = jsimd_ycc_extbgrx_convert_mmi;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    mmifct = jsimd_ycc_extxbgr_convert_mmi;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    mmifct = jsimd_ycc_extxrgb_convert_mmi;
    break;
  default:
    break;
  }

  mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
                          JSAMPIMAGE input_buf, JDIMENSION input_row,
                          JSAMPARRAY output_buf, int num_rows)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_ycc_rgb565() returns 0. */
}
GLOBAL(void)
jsimd_c_null_convert (j_compress_ptr cinfo,
                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                      JDIMENSION output_row, int num_rows)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_c_can_null_convert() returns 0. */
}
GLOBAL(int)
jsimd_can_h2v2_downsample (void)
{
  init_simd();

  /* The MMI downsampling code supports the default build
   * configuration only. */
  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
      (simd_support & JSIMD_MMI) != 0)
    return 1;

  return 0;
}
GLOBAL(int)
jsimd_can_h2v2_smooth_downsample (void)
{
  /* No MMI implementation of smooth 2x2 downsampling. */
  return 0;
}
GLOBAL(int)
jsimd_can_h2v1_downsample (void)
{
  /* No MMI implementation of 2x1 downsampling. */
  return 0;
}
GLOBAL(void)
jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  /* Delegate 2x2 chroma downsampling to the Loongson MMI kernel, passing
   * the component geometry it needs from cinfo/compptr. */
  jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
                            compptr->v_samp_factor, compptr->width_in_blocks,
                            input_data, output_data);
}
GLOBAL(void)
jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
                              jpeg_component_info *compptr,
                              JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v2_smooth_downsample() returns 0. */
}
GLOBAL(void)
jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v1_downsample() returns 0. */
}
GLOBAL(int)
jsimd_can_h2v2_upsample (void)
{
  /* No MMI implementation of plain 2x2 upsampling. */
  return 0;
}
GLOBAL(int)
jsimd_can_h2v1_upsample (void)
{
  /* No MMI implementation of plain 2x1 upsampling. */
  return 0;
}
GLOBAL(int)
jsimd_can_int_upsample (void)
{
  /* No MMI implementation of integer upsampling. */
  return 0;
}
GLOBAL(void)
jsimd_h2v2_upsample (j_decompress_ptr cinfo,
                     jpeg_component_info *compptr,
                     JSAMPARRAY input_data,
                     JSAMPARRAY *output_data_ptr)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v2_upsample() returns 0. */
}
GLOBAL(void)
jsimd_h2v1_upsample (j_decompress_ptr cinfo,
                     jpeg_component_info *compptr,
                     JSAMPARRAY input_data,
                     JSAMPARRAY *output_data_ptr)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v1_upsample() returns 0. */
}
GLOBAL(void)
jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_int_upsample() returns 0. */
}
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample (void)
{
  init_simd();

  /* The MMI fancy upsampling code supports the default build
   * configuration only. */
  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
      (simd_support & JSIMD_MMI) != 0)
    return 1;

  return 0;
}
GLOBAL(int)
jsimd_can_h2v1_fancy_upsample (void)
{
  /* No MMI implementation of fancy 2x1 upsampling. */
  return 0;
}
GLOBAL(void)
jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
                           jpeg_component_info *compptr,
                           JSAMPARRAY input_data,
                           JSAMPARRAY *output_data_ptr)
{
  /* Delegate fancy (triangle-filter) 2x2 chroma upsampling to the
   * Loongson MMI kernel. */
  jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
                                compptr->downsampled_width, input_data,
                                output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
                           jpeg_component_info *compptr,
                           JSAMPARRAY input_data,
                           JSAMPARRAY *output_data_ptr)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v1_fancy_upsample() returns 0. */
}
GLOBAL(int)
jsimd_can_h2v2_merged_upsample (void)
{
  /* No MMI implementation of merged 2x2 upsampling/color conversion. */
  return 0;
}
GLOBAL(int)
jsimd_can_h2v1_merged_upsample (void)
{
  /* No MMI implementation of merged 2x1 upsampling/color conversion. */
  return 0;
}
GLOBAL(void)
jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
                            JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr,
                            JSAMPARRAY output_buf)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v2_merged_upsample() returns 0. */
}
GLOBAL(void)
jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
                            JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr,
                            JSAMPARRAY output_buf)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_h2v1_merged_upsample() returns 0. */
}
GLOBAL(int)
jsimd_can_convsamp (void)
{
  /* No MMI implementation of sample conversion. */
  return 0;
}
GLOBAL(int)
jsimd_can_convsamp_float (void)
{
  /* No MMI implementation of floating-point sample conversion. */
  return 0;
}
GLOBAL(void)
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
                DCTELEM *workspace)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_convsamp() returns 0. */
}
GLOBAL(void)
jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
                      FAST_FLOAT *workspace)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_convsamp_float() returns 0. */
}
GLOBAL(int)
jsimd_can_fdct_islow (void)
{
  init_simd();

  /* The MMI forward DCT supports the default build configuration only. */
  if (DCTSIZE == 8 && sizeof(DCTELEM) == 2 &&
      (simd_support & JSIMD_MMI) != 0)
    return 1;

  return 0;
}
GLOBAL(int)
jsimd_can_fdct_ifast (void)
{
  /* No MMI implementation of the fast integer forward DCT. */
  return 0;
}
GLOBAL(int)
jsimd_can_fdct_float (void)
{
  /* No MMI implementation of the floating-point forward DCT. */
  return 0;
}
GLOBAL(void)
jsimd_fdct_islow (DCTELEM *data)
{
  /* Delegate the slow (accurate) integer forward DCT to the MMI kernel. */
  jsimd_fdct_islow_mmi(data);
}
GLOBAL(void)
jsimd_fdct_ifast (DCTELEM *data)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_fdct_ifast() returns 0. */
}
GLOBAL(void)
jsimd_fdct_float (FAST_FLOAT *data)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_fdct_float() returns 0. */
}
GLOBAL(int)
jsimd_can_quantize (void)
{
  init_simd();

  /* The MMI quantization code supports the default build
   * configuration only. */
  if (DCTSIZE == 8 && sizeof(JCOEF) == 2 && sizeof(DCTELEM) == 2 &&
      (simd_support & JSIMD_MMI) != 0)
    return 1;

  return 0;
}
GLOBAL(int)
jsimd_can_quantize_float (void)
{
  /* No MMI implementation of floating-point quantization. */
  return 0;
}
GLOBAL(void)
jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                DCTELEM *workspace)
{
  /* Delegate integer coefficient quantization to the MMI kernel. */
  jsimd_quantize_mmi(coef_block, divisors, workspace);
}
GLOBAL(void)
jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
                      FAST_FLOAT *workspace)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_quantize_float() returns 0. */
}
GLOBAL(int)
jsimd_can_idct_2x2 (void)
{
  /* No MMI implementation of the reduced-size 2x2 IDCT. */
  return 0;
}
GLOBAL(int)
jsimd_can_idct_4x4 (void)
{
  /* No MMI implementation of the reduced-size 4x4 IDCT. */
  return 0;
}
GLOBAL(int)
jsimd_can_idct_6x6 (void)
{
  /* No MMI implementation of the 6x6 IDCT. */
  return 0;
}
GLOBAL(int)
jsimd_can_idct_12x12 (void)
{
  /* No MMI implementation of the 12x12 IDCT. */
  return 0;
}
GLOBAL(void)
jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_idct_2x2() returns 0. */
}
GLOBAL(void)
jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_idct_4x4() returns 0. */
}
GLOBAL(void)
jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_idct_6x6() returns 0. */
}
GLOBAL(void)
jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_idct_12x12() returns 0. */
}
GLOBAL(int)
jsimd_can_idct_islow (void)
{
  init_simd();

  /* The MMI inverse DCT supports the default build configuration only. */
  if (DCTSIZE == 8 && sizeof(JCOEF) == 2 && BITS_IN_JSAMPLE == 8 &&
      sizeof(JDIMENSION) == 4 && sizeof(ISLOW_MULT_TYPE) == 2 &&
      (simd_support & JSIMD_MMI) != 0)
    return 1;

  return 0;
}
GLOBAL(int)
jsimd_can_idct_ifast (void)
{
  /* No MMI implementation of the fast integer inverse DCT. */
  return 0;
}
GLOBAL(int)
jsimd_can_idct_float (void)
{
  /* No MMI implementation of the floating-point inverse DCT. */
  return 0;
}
GLOBAL(void)
jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
{
  /* Delegate the slow (accurate) integer inverse DCT to the MMI kernel,
   * passing the component's dequantization table. */
  jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
}
GLOBAL(void)
jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_idct_ifast() returns 0. */
}
GLOBAL(void)
jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
{
  /* Intentionally empty: unreachable in practice because
   * jsimd_can_idct_float() returns 0. */
}
GLOBAL(int)
jsimd_can_huff_encode_one_block (void)
{
  /* No MMI implementation of Huffman block encoding. */
  return 0;
}
GLOBAL(JOCTET*)
jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
                             int last_dc_val, c_derived_tbl *dctbl,
                             c_derived_tbl *actbl)
{
  /* Unreachable in practice because jsimd_can_huff_encode_one_block()
   * returns 0; NULL is returned only to satisfy the signature. */
  return NULL;
}

57
simd/loongson/jsimd_mmi.h Normal file
View File

@@ -0,0 +1,57 @@
/*
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jdct.h"
#include "loongson-mmintrin.h"
/* Common code */

#define SIZEOF_MMWORD 8   /* size in bytes of one MMI vector (__m64) */
#define BYTE_BIT 8
#define WORD_BIT 16
#define SCALEBITS 16

/* Build a 64-bit constant from individual 8-, 16-, or 32-bit lanes, most
 * significant lane first.  Each argument is parenthesized before the
 * truncating cast so that expression arguments (e.g. "x ? a : b") are
 * evaluated as a whole; without the parentheses, the cast and shift would
 * bind to only part of the expansion and silently mis-group it. */
#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
  (((uint64_t)(uint8_t)(a) << 56) | \
   ((uint64_t)(uint8_t)(b) << 48) | \
   ((uint64_t)(uint8_t)(c) << 40) | \
   ((uint64_t)(uint8_t)(d) << 32) | \
   ((uint64_t)(uint8_t)(e) << 24) | \
   ((uint64_t)(uint8_t)(f) << 16) | \
   ((uint64_t)(uint8_t)(g) << 8) | \
   ((uint64_t)(uint8_t)(h)))
#define _uint64_set_pi16(a, b, c, d) \
  (((uint64_t)(uint16_t)(a) << 48) | \
   ((uint64_t)(uint16_t)(b) << 32) | \
   ((uint64_t)(uint16_t)(c) << 16) | \
   ((uint64_t)(uint16_t)(d)))
#define _uint64_set_pi32(a, b) \
  (((uint64_t)(uint32_t)(a) << 32) | \
   ((uint64_t)(uint32_t)(b)))

/* Read one __m64 worth of data starting at const_value[index]; each file
 * that includes this header is expected to define const_value[]. */
#define get_const_value(index)  (*(__m64 *)&const_value[(index)])

File diff suppressed because it is too large Load Diff