SIMD support for performing color conversion using MIPS DSPr2 instructions

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@993 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2013-07-24 21:50:20 +00:00
parent 822c8507bb
commit 0be9fa5735
8 changed files with 1078 additions and 0 deletions

View File

@@ -6,6 +6,10 @@ line padding (previously, it only supported 4-byte padding, which was
compatible with X Video.) Also, the decompress-to-YUV function has been
extended to support image scaling.
[2] Added SIMD acceleration for performing color conversion on DSPr2-capable
MIPS platforms. This speeds up the compression of full-color JPEGs by 6-17%
on such platforms and decompression by 3-5%.
1.3.0
=====

View File

@@ -180,3 +180,33 @@ AC_DEFUN([AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE],[
$2
fi
])
# AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE
# --------------------------
# Test whether the toolchain accepts MIPS DSPr2 instructions. A small C
# program containing a precr.qb.ph inline-asm statement is compiled with
# -mdspr2 appended to the CCASFLAGS value. The first macro argument is
# expanded when that compile succeeds and the second one otherwise.
AC_DEFUN([AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE],[
have_mips_dspr2=no
# CFLAGS is replaced only for the duration of the probe and restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CCASFLAGS -mdspr2"
AC_COMPILE_IFELSE([[
int main ()
{
int c = 0, a = 0, b = 0;
__asm__ __volatile__ (
"precr.qb.ph %[c], %[a], %[b] \n\t"
: [c] "=r" (c)
: [a] "r" (a), [b] "r" (b)
);
return c;
}
]], have_mips_dspr2=yes)
CFLAGS=$ac_save_CFLAGS
# Dispatch to the caller-supplied success or failure action.
if test "x$have_mips_dspr2" = "xyes" ; then
$1
else
$2
fi
])

View File

@@ -425,6 +425,16 @@ if test "x${with_simd}" != "xno"; then
with_simd=no
AC_MSG_WARN([SIMD support can't be enabled. Performance will suffer.])])
;;
mipsel*)
# host_cpu matches mipsel*: SIMD stays enabled only when the DSPr2 compile
# probe succeeds; otherwise with_simd is forced to no and a warning is issued.
AC_MSG_RESULT([yes (mipsel)])
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE(
[AC_MSG_RESULT([yes])
simd_arch=mipsel],
[AC_MSG_RESULT([no])
with_simd=no
AC_MSG_WARN([SIMD support can't be enabled. Performance will suffer.])])
;;
*)
AC_MSG_RESULT([no ("$host_cpu")])
AC_MSG_WARN([SIMD support not available for this CPU. Performance will suffer.])
@@ -444,6 +454,7 @@ AM_CONDITIONAL([WITH_SSE_FLOAT_DCT], [test "x$simd_arch" = "xx86_64" -o "x$simd_
AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
AM_CONDITIONAL([SIMD_MIPSEL], [test "x$simd_arch" = "xmipsel"])
AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])

View File

@@ -58,6 +58,12 @@ libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
endif
# MIPS DSPr2 build: the C dispatch layer plus the DSPr2 assembly routines.
if SIMD_MIPSEL
libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2.S
endif
AM_CPPFLAGS = -I$(top_srcdir)
.asm.lo:

View File

@@ -3,6 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2011 D. R. Commander
* Copyright (C) 2013, MIPS Technologies, Inc., California
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,6 +19,7 @@
#define JSIMD_SSE 0x04
#define JSIMD_SSE2 0x08
#define JSIMD_ARM_NEON 0x10
#define JSIMD_MIPS_DSPR2 0x20
/* Short forms of external names for systems with brain-damaged linkers. */
@@ -386,6 +388,64 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_neon
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
/*
* MIPS DSPr2 color conversion, RGB family -> YCbCr.
* Arguments: image width, input rows, output planes, first output row index
* and number of rows to convert.
*
* NOTE(review): the generator macro in jsimd_mips_dspr2.S is instantiated
* only for the ext* layouts, so jsimd_rgb_ycc_convert_mips_dspr2 appears to
* be declared without a matching definition -- confirm before referencing it.
*/
EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
/*
* MIPS DSPr2 color conversion, YCbCr -> RGB family.
* Arguments: image width, input planes, first input row index, output rows
* and number of rows to convert.
*
* NOTE(review): as above, jsimd_ycc_rgb_convert_mips_dspr2 has no matching
* generator instantiation in jsimd_mips_dspr2.S -- confirm before use.
*/
EXTERN (void) jsimd_ycc_rgb_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
JPP((JDIMENSION img_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
/* SIMD Downsample */
EXTERN(void) jsimd_h2v2_downsample_mmx
JPP((JDIMENSION image_width, int max_v_samp_factor,

465
simd/jsimd_mips.c Normal file
View File

@@ -0,0 +1,465 @@
/*
* jsimd_mips.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2009-2011 D. R. Commander
* Copyright (C) 2013, MIPS Technologies, Inc., California
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* For conditions of distribution and use, see copyright notice in jsimdext.inc
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on
* MIPS architecture.
*
* Based on the stubs from 'jsimd_none.c'
*/
#define JPEG_INTERNALS
#include "../jinclude.h"
#include "../jpeglib.h"
#include "../jsimd.h"
#include "../jdct.h"
#include "../jsimddct.h"
#include "jsimd.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
/*
* Bitmask of JSIMD_* capabilities detected at run time. The all-ones value
* means "not probed yet"; init_simd() replaces it on first use.
*/
static unsigned int simd_support = ~0;
/*
 * Run-time DSPr2 probe used on Linux when the library itself was not built
 * with DSPr2 enabled.
 *
 * Scans /proc/cpuinfo line by line for search_string.  On a match the
 * JSIMD_MIPS_DSPR2 bit is set in simd_support and 1 is returned; if the file
 * cannot be opened or no line matches, simd_support is left at 0 and 0 is
 * returned.
 *
 * The guard mirrors the only call site in init_simd(): when DSPr2 is already
 * selected at compile time the helper is never called, so it is not compiled
 * either (this avoids an unused-static-function warning in such builds).
 */
#if defined(__linux__) && \
    !(defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2))
LOCAL(int)
parse_proc_cpuinfo(const char* search_string)
{
  const char* file_name = "/proc/cpuinfo";
  char cpuinfo_line[256];
  FILE* f = NULL;
  int found = 0;

  /* Start from "nothing detected"; only a match below sets a capability. */
  simd_support = 0;

  f = fopen(file_name, "r");
  if (f == NULL)
    return 0;

  while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
    if (strstr(cpuinfo_line, search_string) != NULL) {
      simd_support |= JSIMD_MIPS_DSPR2;
      found = 1;
      break;
    }
  }

  /* Single close path for both the match and the no-match case. */
  fclose(f);
  return found;
}
#endif
/*
 * Determine once which SIMD accelerations may be used and cache the answer
 * in simd_support.  Later calls return immediately.
 *
 * FIXME: This code is racy under a multi-threaded environment.
 */
LOCAL(void)
init_simd (void)
{
  if (simd_support == ~0U) {
    simd_support = 0;

#if defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    /* DSPr2 was enabled when compiling this file: no run-time probe needed. */
    simd_support |= JSIMD_MIPS_DSPR2;
#elif defined(__linux__)
    /* Not built with -mdspr2 globally: fall back to run-time detection by
     * looking for a known DSPr2-capable core name in /proc/cpuinfo.  The
     * helper records a match in simd_support itself, so its return value
     * is not needed here. */
    (void) parse_proc_cpuinfo("MIPS 74K");
#endif
  }
}
/*
 * Report whether the DSPr2 RGB -> YCbCr converter may be used: requires
 * 8-bit samples, a 4-byte JDIMENSION, 3- or 4-byte RGB pixels and DSPr2
 * support detected by init_simd().  Returns 1 if usable, 0 otherwise.
 */
GLOBAL(int)
jsimd_can_rgb_ycc (void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;
  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
    return 0;

  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
}
/* RGB -> grayscale has no DSPr2 implementation here: always reports 0. */
GLOBAL(int)
jsimd_can_rgb_gray (void)
{
return 0;
}
/*
 * Report whether the DSPr2 YCbCr -> RGB converter may be used: requires
 * 8-bit samples, a 4-byte JDIMENSION, 3- or 4-byte RGB pixels and DSPr2
 * support detected by init_simd().  Returns 1 if usable, 0 otherwise.
 */
GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;
  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
    return 0;

  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
}
/*
 * Convert num_rows rows of RGB-family input into YCbCr using the DSPr2
 * routine that matches cinfo->in_color_space.  Layouts with an X or A byte
 * share one routine per byte order; JCS_EXT_RGB and every color space not
 * listed fall back to the extrgb routine.  Nothing is done when DSPr2
 * support has not been detected.
 */
GLOBAL(void)
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                       JDIMENSION output_row, int num_rows)
{
  /* Start with the fallback converter; the switch overrides it as needed. */
  void (*convert)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
    jsimd_extrgb_ycc_convert_mips_dspr2;

  switch (cinfo->in_color_space) {
    case JCS_EXT_RGBX:
    case JCS_EXT_RGBA:
      convert = jsimd_extrgbx_ycc_convert_mips_dspr2;
      break;
    case JCS_EXT_BGR:
      convert = jsimd_extbgr_ycc_convert_mips_dspr2;
      break;
    case JCS_EXT_BGRX:
    case JCS_EXT_BGRA:
      convert = jsimd_extbgrx_ycc_convert_mips_dspr2;
      break;
    case JCS_EXT_XBGR:
    case JCS_EXT_ABGR:
      convert = jsimd_extxbgr_ycc_convert_mips_dspr2;
      break;
    case JCS_EXT_XRGB:
    case JCS_EXT_ARGB:
      convert = jsimd_extxrgb_ycc_convert_mips_dspr2;
      break;
    case JCS_EXT_RGB:
    default:
      /* keep the extrgb fallback */
      break;
  }

  if (simd_support & JSIMD_MIPS_DSPR2)
    convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
/*
* RGB -> grayscale conversion entry point. Intentionally empty: the matching
* query jsimd_can_rgb_gray() always returns 0 in this file.
*/
GLOBAL(void)
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
}
/*
 * Convert num_rows rows of YCbCr input into the RGB-family layout named by
 * cinfo->out_color_space using the matching DSPr2 routine.  Layouts with an
 * X or A byte share one routine per byte order; JCS_EXT_RGB and every color
 * space not listed fall back to the extrgb routine.  Nothing is done when
 * DSPr2 support has not been detected.
 */
GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
                       JSAMPIMAGE input_buf, JDIMENSION input_row,
                       JSAMPARRAY output_buf, int num_rows)
{
  /* Start with the fallback converter; the switch overrides it as needed. */
  void (*convert)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
    jsimd_ycc_extrgb_convert_mips_dspr2;

  switch (cinfo->out_color_space) {
    case JCS_EXT_RGBX:
    case JCS_EXT_RGBA:
      convert = jsimd_ycc_extrgbx_convert_mips_dspr2;
      break;
    case JCS_EXT_BGR:
      convert = jsimd_ycc_extbgr_convert_mips_dspr2;
      break;
    case JCS_EXT_BGRX:
    case JCS_EXT_BGRA:
      convert = jsimd_ycc_extbgrx_convert_mips_dspr2;
      break;
    case JCS_EXT_XBGR:
    case JCS_EXT_ABGR:
      convert = jsimd_ycc_extxbgr_convert_mips_dspr2;
      break;
    case JCS_EXT_XRGB:
    case JCS_EXT_ARGB:
      convert = jsimd_ycc_extxrgb_convert_mips_dspr2;
      break;
    case JCS_EXT_RGB:
    default:
      /* keep the extrgb fallback */
      break;
  }

  if (simd_support & JSIMD_MIPS_DSPR2)
    convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
/*
* Downsampling: not accelerated in this file. The capability queries return 0
* and the corresponding entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_h2v2_downsample (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_downsample (void)
{
return 0;
}
/* Empty stub; jsimd_can_h2v2_downsample() reports 0. */
GLOBAL(void)
jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
/* Empty stub; jsimd_can_h2v1_downsample() reports 0. */
GLOBAL(void)
jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
/*
* Plain upsampling: not accelerated in this file. The capability queries
* return 0 and the corresponding entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_h2v2_upsample (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_upsample (void)
{
return 0;
}
/* Empty stub; jsimd_can_h2v2_upsample() reports 0. */
GLOBAL(void)
jsimd_h2v2_upsample (j_decompress_ptr cinfo,
jpeg_component_info * compptr,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
}
/* Empty stub; jsimd_can_h2v1_upsample() reports 0. */
GLOBAL(void)
jsimd_h2v1_upsample (j_decompress_ptr cinfo,
jpeg_component_info * compptr,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
}
/*
* Fancy upsampling: not accelerated in this file. The capability queries
* return 0 and the corresponding entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_fancy_upsample (void)
{
return 0;
}
/* Empty stub; jsimd_can_h2v2_fancy_upsample() reports 0. */
GLOBAL(void)
jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
jpeg_component_info * compptr,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
}
/* Empty stub; jsimd_can_h2v1_fancy_upsample() reports 0. */
GLOBAL(void)
jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
jpeg_component_info * compptr,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
}
/*
* Merged upsampling/color conversion: not accelerated in this file. The
* capability queries return 0 and the entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_h2v2_merged_upsample (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_merged_upsample (void)
{
return 0;
}
/* Empty stub; jsimd_can_h2v2_merged_upsample() reports 0. */
GLOBAL(void)
jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf)
{
}
/* Empty stub; jsimd_can_h2v1_merged_upsample() reports 0. */
GLOBAL(void)
jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf)
{
}
/*
* Sample conversion (integer and float): not accelerated in this file. The
* capability queries return 0 and the entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_convsamp (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_convsamp_float (void)
{
return 0;
}
/* Empty stub; jsimd_can_convsamp() reports 0. */
GLOBAL(void)
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM * workspace)
{
}
/* Empty stub; jsimd_can_convsamp_float() reports 0. */
GLOBAL(void)
jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
FAST_FLOAT * workspace)
{
}
/*
* Forward DCT (islow, ifast, float): not accelerated in this file. The
* capability queries return 0 and the entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_fdct_islow (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_fdct_ifast (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_fdct_float (void)
{
return 0;
}
/* Empty stub; jsimd_can_fdct_islow() reports 0. */
GLOBAL(void)
jsimd_fdct_islow (DCTELEM * data)
{
}
/* Empty stub; jsimd_can_fdct_ifast() reports 0. */
GLOBAL(void)
jsimd_fdct_ifast (DCTELEM * data)
{
}
/* Empty stub; jsimd_can_fdct_float() reports 0. */
GLOBAL(void)
jsimd_fdct_float (FAST_FLOAT * data)
{
}
/*
* Quantization (integer and float): not accelerated in this file. The
* capability queries return 0 and the entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_quantize (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_quantize_float (void)
{
return 0;
}
/* Empty stub; jsimd_can_quantize() reports 0. */
GLOBAL(void)
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
DCTELEM * workspace)
{
}
/* Empty stub; jsimd_can_quantize_float() reports 0. */
GLOBAL(void)
jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
FAST_FLOAT * workspace)
{
}
/*
* Reduced-size inverse DCT (2x2, 4x4): not accelerated in this file. The
* capability queries return 0 and the entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_idct_2x2 (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_idct_4x4 (void)
{
return 0;
}
/* Empty stub; jsimd_can_idct_2x2() reports 0. */
GLOBAL(void)
jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}
/* Empty stub; jsimd_can_idct_4x4() reports 0. */
GLOBAL(void)
jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}
/*
* Full-size inverse DCT (islow, ifast, float): not accelerated in this file.
* The capability queries return 0 and the entry points are empty stubs.
*/
GLOBAL(int)
jsimd_can_idct_islow (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_idct_ifast (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_idct_float (void)
{
return 0;
}
/* Empty stub; jsimd_can_idct_islow() reports 0. */
GLOBAL(void)
jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}
/* Empty stub; jsimd_can_idct_ifast() reports 0. */
GLOBAL(void)
jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}
/* Empty stub; jsimd_can_idct_float() reports 0. */
GLOBAL(void)
jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}

250
simd/jsimd_mips_dspr2.S Normal file
View File

@@ -0,0 +1,250 @@
/*
* MIPS DSPr2 optimizations for libjpeg-turbo
*
* Copyright (C) 2013, MIPS Technologies, Inc., California.
* All rights reserved.
* Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
* Darko Laus (darko.laus@imgtec.com)
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#include "jsimd_mips_dspr2_asm.h"
/*****************************************************************************/
/*
* jsimd_extrgb_ycc_convert_mips_dspr2
* jsimd_extbgr_ycc_convert_mips_dspr2
* jsimd_extrgbx_ycc_convert_mips_dspr2
* jsimd_extbgrx_ycc_convert_mips_dspr2
* jsimd_extxbgr_ycc_convert_mips_dspr2
* jsimd_extxrgb_ycc_convert_mips_dspr2
*
* Colorspace conversion RGB -> YCbCr
*
* The generator macro below emits one routine per input pixel layout:
* colorid - layout name spliced into the exported symbol name
* pixel_size - bytes per input pixel (the input pointer advances by this)
* r_offs, g_offs, b_offs - byte offsets of R, G and B inside one pixel
*
* Both loops test their exit condition after the body has run, so one row
* and one pixel are always processed.
* NOTE(review): presumably callers guarantee num_rows >= 1 and width >= 1.
*/
.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
/*
* Helper: load the R, G and B bytes of the pixel at inptr into r, g and b,
* then step inptr to the next pixel. It only loads; the arithmetic is done
* inline in the pixel loop.
*/
.macro DO_RGB_TO_YCC r, \
g, \
b, \
inptr
lbu \r, \r_offs(\inptr)
lbu \g, \g_offs(\inptr)
lbu \b, \b_offs(\inptr)
addiu \inptr, \pixel_size
.endm
LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
/*
* a0 - cinfo->image_width
* a1 - input_buf
* a2 - output_buf
* a3 - output_row
* 16(sp) - num_rows
*/
/* Reserve 32 bytes and save s0-s7; they hold the conversion constants. */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
/* The fifth argument sits at 16(sp) of the caller, i.e. 16 + 32 here. */
lw t7, 48(sp) // t7 = num_rows
li s0, 0x4c8b // FIX(0.29900)
li s1, 0x9646 // FIX(0.58700)
li s2, 0x1d2f // FIX(0.11400)
li s3, 0xffffd4cd // -FIX(0.16874)
li s4, 0xffffab33 // -FIX(0.33126)
li s5, 0x8000 // FIX(0.50000)
li s6, 0xffff94d1 // -FIX(0.41869)
li s7, 0xffffeb2f // -FIX(0.08131)
li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
/* Row loop: one iteration per input row. */
0:
addiu t7, -1 // --num_rows
lw t6, 0(a1) // t6 = input_buf[0]
lw t0, 0(a2) // t0 = output_buf[0]
lw t1, 4(a2) // t1 = output_buf[1]
lw t2, 8(a2) // t2 = output_buf[2]
sll t3, a3, 2 // t3 = output_row * 4 (byte index into the row pointer arrays)
lwx t0, t3(t0) // t0 = output_buf[0][output_row]
lwx t1, t3(t1) // t1 = output_buf[1][output_row]
lwx t2, t3(t2) // t2 = output_buf[2][output_row]
addu t9, t2, a0 // t9 = end address
addiu a3, 1 // ++output_row for the next pass
/* Pixel loop: t3 = r, t4 = g, t5 = b; ac0, ac1, ac2 accumulate Y, Cb, Cr. */
1:
DO_RGB_TO_YCC t3, t4, t5, t6
mtlo s5, $ac0 // Y accumulator starts at s5 (0x8000)
mtlo t8, $ac1 // Cb accumulator starts at t8
mtlo t8, $ac2 // Cr accumulator starts at t8
maddu $ac0, s2, t5 // Y += FIX(0.11400) * b
maddu $ac1, s5, t5 // Cb += FIX(0.50000) * b
maddu $ac2, s5, t3 // Cr += FIX(0.50000) * r
maddu $ac0, s0, t3 // Y += FIX(0.29900) * r
maddu $ac1, s3, t3 // Cb += -FIX(0.16874) * r
maddu $ac2, s6, t4 // Cr += -FIX(0.41869) * g
maddu $ac0, s1, t4 // Y += FIX(0.58700) * g
maddu $ac1, s4, t4 // Cb += -FIX(0.33126) * g
maddu $ac2, s7, t5 // Cr += -FIX(0.08131) * b
extr.w t3, $ac0, 16 // t3 = Y accumulator >> 16
extr.w t4, $ac1, 16 // t4 = Cb accumulator >> 16
extr.w t5, $ac2, 16 // t5 = Cr accumulator >> 16
sb t3, 0(t0) // store the low byte of each result
sb t4, 0(t1)
sb t5, 0(t2)
addiu t0, 1
addiu t2, 1
bne t2, t9, 1b // loop until the third output row pointer reaches t9
addiu t1, 1 // (branch delay slot) advance the second output row pointer
bgtz t7, 0b // more rows left?
addiu a1, 4 // (branch delay slot) step to the next input row pointer
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
/* Drop the helper so the next instantiation can define it again. */
.purgem DO_RGB_TO_YCC
.endm
/* One routine per supported input pixel layout. */
/*------------------------------------------id -- pix R G B */
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
/*****************************************************************************/
/*
* jsimd_ycc_extrgb_convert_mips_dspr2
* jsimd_ycc_extbgr_convert_mips_dspr2
* jsimd_ycc_extrgbx_convert_mips_dspr2
* jsimd_ycc_extbgrx_convert_mips_dspr2
* jsimd_ycc_extxbgr_convert_mips_dspr2
* jsimd_ycc_extxrgb_convert_mips_dspr2
*
* Colorspace conversion YCbCr -> RGB
*
* The generator macro below emits one routine per output pixel layout:
* colorid - layout name spliced into the exported symbol name
* pixel_size - bytes per output pixel (the output pointer advances by this)
* r_offs, g_offs, b_offs - byte offsets of R, G and B inside one pixel
* a_offs - byte offset of the fourth byte; written only when pixel_size is 4
*
* Both loops test their exit condition after the body has run, so one row
* and one pixel are always processed.
* NOTE(review): presumably callers guarantee num_rows >= 1 and width >= 1.
*/
.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
/*
* Helper: store the low bytes of scratch0, scratch1 and scratch2 as R, G
* and B of the pixel at outptr and step outptr to the next pixel. For
* 4-byte pixels the fourth byte is set to 0xFF; t0 is overwritten for that,
* after the three channel stores have been issued.
*/
.macro STORE_YCC_TO_RGB scratch0 \
scratch1 \
scratch2 \
outptr
sb \scratch0, \r_offs(\outptr)
sb \scratch1, \g_offs(\outptr)
sb \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
li t0, 0xFF
sb t0, \a_offs(\outptr)
.endif
addiu \outptr, \pixel_size
.endm
LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
/*
* a0 - cinfo->image_width
* a1 - input_buf
* a2 - input_row
* a3 - output_buf
* 16(sp) - num_rows
*/
/* Reserve 32 bytes and save s0-s7, which are used as working registers. */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
/* The fifth argument sits at 16(sp) of the caller, i.e. 16 + 32 here. */
lw s1, 48(sp) // s1 = num_rows
li t3, 0x8000 // rounding term added to the green sum before the >> 16
li t4, 0x166e9 // FIX(1.40200)
li t5, 0x1c5a2 // FIX(1.77200)
li t6, 0xffff492e // -FIX(0.71414)
li t7, 0xffffa7e6 // -FIX(0.34414)
repl.ph t8, 128 // t8 = 128 in both halfwords
/* Row loop: one iteration per output row. */
0:
lw s0, 0(a3) // s0 = output_buf[0] (current output row)
lw t0, 0(a1) // t0 = input_buf[0]
lw t1, 4(a1) // t1 = input_buf[1]
lw t2, 8(a1) // t2 = input_buf[2]
sll s5, a2, 2 // s5 = input_row * 4 (byte index into the row pointer arrays)
addiu s1, -1 // --num_rows
lwx s2, s5(t0) // s2 = input_buf[0][input_row] (y row)
lwx s3, s5(t1) // s3 = input_buf[1][input_row] (cb row)
lwx s4, s5(t2) // s4 = input_buf[2][input_row] (cr row)
addu t9, s2, a0 // t9 = end address of the y row
addiu a2, 1 // ++input_row for the next pass
/* Pixel loop: red ends up in t1, green in t2, blue in t0. */
1:
lbu s7, 0(s4) // cr
lbu s6, 0(s3) // cb
lbu s5, 0(s2) // y
addiu s2, 1
addiu s4, 1
addiu s7, -128 // cr - 128
addiu s6, -128 // cb - 128
mul t2, t7, s6 // -FIX(0.34414) * (cb - 128)
mul t0, t6, s7 // Crgtab[cr]
sll s7, 15 // scale (cr - 128) for the fractional multiply below
mulq_rs.w t1, t4, s7 // Crrtab[cr]
sll s6, 15 // scale (cb - 128) for the fractional multiply below
addu t2, t3 // Cbgtab[cb]
addu t2, t0 // green chroma term = Cbgtab[cb] + Crgtab[cr]
mulq_rs.w t0, t5, s6 // Cbbtab[cb]
sra t2, 16
addu t1, s5 // red = y + Crrtab[cr]
addu t2, s5 // add y
ins t2, t1, 16, 16 // pack red into the upper and green into the lower halfword
subu.ph t2, t2, t8 // bias both halfwords by -128
addu t0, s5 // blue = y + Cbbtab[cb]
shll_s.ph t2, t2, 8 // saturating shift left by 8 ...
subu t0, 128 // bias blue by -128
shra.ph t2, t2, 8 // ... and arithmetic shift back
shll_s.w t0, t0, 24 // same saturate-and-shift-back sequence for blue
addu.ph t2, t2, t8 // clip & store
sra t0, t0, 24
sra t1, t2, 16 // t1 = red (upper halfword of t2)
addiu t0, 128 // undo the -128 bias on blue
STORE_YCC_TO_RGB t1, t2, t0, s0
bne s2, t9, 1b // loop until the y pointer reaches t9
addiu s3, 1 // (branch delay slot) advance the cb pointer
bgtz s1, 0b // more rows left?
addiu a3, 4 // (branch delay slot) step to the next output row pointer
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
/* Drop the helper so the next instantiation can define it again. */
.purgem STORE_YCC_TO_RGB
.endm
/* One routine per supported output pixel layout. */
/*------------------------------------------id -- pix R G B A */
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
/*****************************************************************************/

252
simd/jsimd_mips_dspr2_asm.h Normal file
View File

@@ -0,0 +1,252 @@
/*
* MIPS DSPr2 optimizations for libjpeg-turbo
*
* Copyright (C) 2013, MIPS Technologies, Inc., California.
* All rights reserved.
* Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
* Darko Laus (darko.laus@imgtec.com)
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/*
* Symbolic names for the general-purpose registers $0-$31. They are plain
* preprocessor macros, so they are available only in sources that are run
* through the C preprocessor. Note that fp and s8 both name $30.
*/
#define zero $0
#define AT $1
#define v0 $2
#define v1 $3
#define a0 $4
#define a1 $5
#define a2 $6
#define a3 $7
#define t0 $8
#define t1 $9
#define t2 $10
#define t3 $11
#define t4 $12
#define t5 $13
#define t6 $14
#define t7 $15
#define s0 $16
#define s1 $17
#define s2 $18
#define s3 $19
#define s4 $20
#define s5 $21
#define s6 $22
#define s7 $23
#define t8 $24
#define t9 $25
#define k0 $26
#define k1 $27
#define gp $28
#define sp $29
#define fp $30
#define s8 $30
#define ra $31
/*
* LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
*
* Exports the symbol as a function, aligns it, opens its .ent/.frame
* description and saves the current assembler settings (.set push) before
* switching to arch=mips32r2, noreorder and noat. Because noreorder is in
* effect, code following this macro has to fill branch delay slots itself.
* Must be paired with END, which restores the saved settings.
*/
#define LEAF_MIPS32R2(symbol) \
.globl symbol; \
.align 2; \
.type symbol, @function; \
.ent symbol, 0; \
symbol: .frame sp, 0, ra; \
.set push; \
.set arch=mips32r2; \
.set noreorder; \
.set noat;
/*
* LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
*
* Same as LEAF_MIPS32R2, and additionally enables the DSPr2 instructions
* for the body of the routine (.set dspr2).
*/
#define LEAF_MIPS_DSPR2(symbol) \
LEAF_MIPS32R2(symbol) \
.set dspr2;
/*
* END - mark end of function
*
* Restores the assembler settings saved by the LEAF_* macro (.set pop),
* closes the .ent block and records the size of the function.
*/
#define END(function) \
.set pop; \
.end function; \
.size function,.-function
/*
* Checks if stack offset is big enough for storing/restoring regs_num
* number of registers to/from stack. Since the MIPS ABI allows usage of the
* first 16 bytes of the stack frame (this space is reserved for the input
* arguments of the function, already held in a0-a3), those 16 bytes are
* counted as available: the check requires
* stack_offset >= regs_num * 4 - 16
* and stops the assembly with an error otherwise.
*/
.macro CHECK_STACK_OFFSET regs_num, stack_offset
.if \stack_offset < \regs_num * 4 - 16
.error "Stack offset too small."
.endif
.endm
/*
* Saves set of registers on stack. Maximum number of registers that
* can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
* Stack offset is the number of bytes that are subtracted from the stack
* pointer (sp) before registers are pushed in order to provide enough space
* on stack (offset must be a non-negative multiple of 4, and must be big
* enough, as described by CHECK_STACK_OFFSET macro). Registers are stored
* at consecutive word slots starting at 0(sp) of the adjusted stack
* pointer; r1 is mandatory, r2-r14 are optional. This macro is intended to
* be used in combination with RESTORE_REGS_FROM_STACK macro. Example:
* SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
* RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
*/
.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
r2 = 0, r3 = 0, r4 = 0, \
r5 = 0, r6 = 0, r7 = 0, \
r8 = 0, r9 = 0, r10 = 0, \
r11 = 0, r12 = 0, r13 = 0, \
r14 = 0
/* Reject negative offsets and offsets that are not a multiple of 4. */
.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
.error "Stack offset must be pozitive and multiple of 4."
.endif
.if \stack_offset != 0
addiu sp, sp, -\stack_offset
.endif
/* The first four slots need no size check (4 * 4 - 16 == 0). */
sw \r1, 0(sp)
.if \r2 != 0
sw \r2, 4(sp)
.endif
.if \r3 != 0
sw \r3, 8(sp)
.endif
.if \r4 != 0
sw \r4, 12(sp)
.endif
/* From the fifth register on, verify that the frame is large enough. */
.if \r5 != 0
CHECK_STACK_OFFSET 5, \stack_offset
sw \r5, 16(sp)
.endif
.if \r6 != 0
CHECK_STACK_OFFSET 6, \stack_offset
sw \r6, 20(sp)
.endif
.if \r7 != 0
CHECK_STACK_OFFSET 7, \stack_offset
sw \r7, 24(sp)
.endif
.if \r8 != 0
CHECK_STACK_OFFSET 8, \stack_offset
sw \r8, 28(sp)
.endif
.if \r9 != 0
CHECK_STACK_OFFSET 9, \stack_offset
sw \r9, 32(sp)
.endif
.if \r10 != 0
CHECK_STACK_OFFSET 10, \stack_offset
sw \r10, 36(sp)
.endif
.if \r11 != 0
CHECK_STACK_OFFSET 11, \stack_offset
sw \r11, 40(sp)
.endif
.if \r12 != 0
CHECK_STACK_OFFSET 12, \stack_offset
sw \r12, 44(sp)
.endif
.if \r13 != 0
CHECK_STACK_OFFSET 13, \stack_offset
sw \r13, 48(sp)
.endif
.if \r14 != 0
CHECK_STACK_OFFSET 14, \stack_offset
sw \r14, 52(sp)
.endif
.endm
/*
* Restores set of registers from stack. Maximum number of registers that
* can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
* Stack offset is the number of bytes that are added to the stack pointer
* (sp) after registers are restored (offset must be a non-negative multiple
* of 4, and must be big enough, as described by CHECK_STACK_OFFSET macro).
* Registers are loaded from consecutive word slots starting at 0(sp); r1 is
* mandatory, r2-r14 are optional. This macro is intended to be used in
* combination with SAVE_REGS_ON_STACK macro, with the same offset and the
* same register list.
* Example:
* SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
* RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
*/
.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
r2 = 0, r3 = 0, r4 = 0, \
r5 = 0, r6 = 0, r7 = 0, \
r8 = 0, r9 = 0, r10 = 0, \
r11 = 0, r12 = 0, r13 = 0, \
r14 = 0
/* Reject negative offsets and offsets that are not a multiple of 4. */
.if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
.error "Stack offset must be pozitive and multiple of 4."
.endif
/* The first four slots need no size check (4 * 4 - 16 == 0). */
lw \r1, 0(sp)
.if \r2 != 0
lw \r2, 4(sp)
.endif
.if \r3 != 0
lw \r3, 8(sp)
.endif
.if \r4 != 0
lw \r4, 12(sp)
.endif
/* From the fifth register on, verify that the frame is large enough. */
.if \r5 != 0
CHECK_STACK_OFFSET 5, \stack_offset
lw \r5, 16(sp)
.endif
.if \r6 != 0
CHECK_STACK_OFFSET 6, \stack_offset
lw \r6, 20(sp)
.endif
.if \r7 != 0
CHECK_STACK_OFFSET 7, \stack_offset
lw \r7, 24(sp)
.endif
.if \r8 != 0
CHECK_STACK_OFFSET 8, \stack_offset
lw \r8, 28(sp)
.endif
.if \r9 != 0
CHECK_STACK_OFFSET 9, \stack_offset
lw \r9, 32(sp)
.endif
.if \r10 != 0
CHECK_STACK_OFFSET 10, \stack_offset
lw \r10, 36(sp)
.endif
.if \r11 != 0
CHECK_STACK_OFFSET 11, \stack_offset
lw \r11, 40(sp)
.endif
.if \r12 != 0
CHECK_STACK_OFFSET 12, \stack_offset
lw \r12, 44(sp)
.endif
.if \r13 != 0
CHECK_STACK_OFFSET 13, \stack_offset
lw \r13, 48(sp)
.endif
.if \r14 != 0
CHECK_STACK_OFFSET 14, \stack_offset
lw \r14, 52(sp)
.endif
/* Release the frame only after every register has been reloaded. */
.if \stack_offset != 0
addiu sp, sp, \stack_offset
.endif
.endm