SIMD-accelerated slow integer IDCT routine for MIPS DSPr2

This commit is contained in:
DRC
2014-05-06 09:53:21 +00:00
parent 361192b972
commit 7824f70008
4 changed files with 319 additions and 3 deletions

View File

@@ -30,5 +30,5 @@
"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
"Copyright (C) 2009-2014 D. R. Commander\n" \
"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
"Copyright (C) 2013 MIPS Technologies, Inc.\n" \
"Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
"Copyright (C) 2013 Linaro Limited"

View File

@@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2011 D. R. Commander
* Copyright (C) 2013, MIPS Technologies, Inc., California
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -801,6 +801,10 @@ EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
JSAMPARRAY output_buf,
JDIMENSION output_col));
/*
 * Slow-but-accurate integer IDCT, MIPS DSPr2 assembly implementation.
 *
 * The parameter order below is the order actually used by the caller,
 * jsimd_idct_islow(), and by the assembly routine's register comments
 * (a0 = coef_block, a1 = dct_table, a2 = output, a3 = range_limit):
 *
 *   coef_block   - the 8x8 block of 16-bit coefficients to transform
 *   dct_table    - the component's multiplier table (compptr->dct_table)
 *   output_buf   - array of 8 ints, each holding the address of one output
 *                  row (already offset by the output column)
 *   range_limit  - sample range-limit lookup table (IDCT_range_limit(cinfo))
 */
EXTERN(void) jsimd_idct_islow_mips_dspr2 JPP((JCOEFPTR coef_block,
                                              void * dct_table,
                                              int * output_buf,
                                              JSAMPLE * range_limit));
EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 JPP((JCOEF * inptr,
IFAST_MULT_TYPE * quantptr,
DCTELEM * wsptr,

View File

@@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2009-2011 D. R. Commander
* Copyright (C) 2013, MIPS Technologies, Inc., California
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -781,6 +781,23 @@ jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
GLOBAL(int)
jsimd_can_idct_islow (void)
{
  init_simd();

  /*
   * The accelerated routine is written for one specific build configuration
   * only: 8x8 blocks, 16-bit coefficients, 8-bit samples, 32-bit JDIMENSION
   * and 16-bit ISLOW multipliers.  Report "not available" for anything else.
   */
  if (DCTSIZE != 8 ||
      sizeof(JCOEF) != 2 ||
      BITS_IN_JSAMPLE != 8 ||
      sizeof(JDIMENSION) != 4 ||
      sizeof(ISLOW_MULT_TYPE) != 2)
    return 0;

  /* Available only when the JSIMD_MIPS_DSPR2 bit is set in simd_support. */
  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
}
@@ -820,6 +837,21 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
if (simd_support & JSIMD_MIPS_DSPR2) {
  /*
   * The assembly routine takes the eight destination row addresses as
   * plain ints, each already advanced to the requested output column.
   */
  int output[8];
  int row;

  for (row = 0; row < 8; row++)
    output[row] = (int)(output_buf[row] + output_col);

  jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table,
                              output, IDCT_range_limit(cinfo));
}
}
GLOBAL(void)

View File

@@ -964,6 +964,286 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
nop
END(jsimd_h2v2_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
/*
* Slow-but-accurate integer inverse DCT of one 8x8 coefficient block.
*
* Pass 1 walks the eight input columns, dequantizes them and writes 32-bit
* intermediate results into a 256-byte workspace reserved on the stack.
* Pass 2 walks the eight workspace rows, finishes the transform, maps every
* result through the range_limit byte table and stores 8 output bytes per row.
* Both passes take a shortcut when all AC terms of a column/row are zero.
*
* a0 - coef_block (16-bit coefficients, read with lh)
* a1 - compptr->dct_table (16-bit multipliers, read with lh)
* a2 - output (array of 32-bit row addresses; one entry is used per row)
* a3 - range_limit (byte table, indexed with a value masked to 10 bits)
*/
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
addiu sp, sp, -256 // reserve the workspace: 8 x 8 words of 4 bytes
move v0, sp // v0 = workspace pointer for pass 1
addiu v1, zero, 8 // v1 = DCTSIZE = 8
/*
* Pass 1: process columns from input, store into work array.
* One column per iteration; workspace rows are 32 bytes apart.
*/
1:
lh s4, 32(a0) // s4 = inptr[16]
lh s5, 64(a0) // s5 = inptr[32]
lh s6, 96(a0) // s6 = inptr[48]
lh t1, 112(a0) // t1 = inptr[56]
lh t7, 16(a0) // t7 = inptr[8]
lh t5, 80(a0) // t5 = inptr[40]
lh t3, 48(a0) // t3 = inptr[24]
or s4, s4, t1 // s4 = OR of all seven AC terms of this column
or s4, s4, t3
or s4, s4, t5
or s4, s4, t7
or s4, s4, s5
or s4, s4, s6
bnez s4, 2f // some AC term is non-zero: do the full column
addiu v1, v1, -1 // (branch delay slot) one column fewer to go
/* AC terms all zero: the scaled DC value fills the whole workspace column */
lh s5, 0(a1) // quantptr[DCTSIZE*0]
lh s6, 0(a0) // inptr[DCTSIZE*0]
mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
sll s5, s5, 2 // dcval = dequantized DC << 2
sw s5, 0(v0)
sw s5, 32(v0)
sw s5, 64(v0)
sw s5, 96(v0)
sw s5, 128(v0)
sw s5, 160(v0)
sw s5, 192(v0)
b 3f
sw s5, 224(v0) // (branch delay slot) eighth store of dcval
2:
/*
* Odd part.  t1/t3/t5/t7 still hold inptr rows 7/3/5/1 from the zero test.
* The FIX_* immediates are the named constants scaled by 2^13
* (e.g. 9633 ~= 1.175875602 * 8192).
*/
lh t0, 112(a1)
lh t2, 48(a1)
lh t4, 80(a1)
lh t6, 16(a1)
mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
lh t4, 32(a1) // preload rows 2 and 6 for the even part
lh t5, 32(a0)
lh t6, 96(a1)
lh t7, 96(a0)
addu s0, t0, t1 // z3 = tmp0 + tmp2
addu s1, t1, t2 // z2 = tmp1 + tmp2
addu s2, t2, t3 // z4 = tmp1 + tmp3
addu s3, s0, s2 // z3 + z4
addiu t9, zero, 9633 // FIX_1_175875602
mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
addu t8, t0, t3 // z1 = tmp0 + tmp3
addiu t9, zero, 2446 // FIX_0_298631336
mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
addiu t9, zero, 16819 // FIX_2_053119869
mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
addiu t9, zero, 25172 // FIX_3_072711026
mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
addiu t9, zero, 12299 // FIX_1_501321110
mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
/* z1..z4 are multiplied by positive constants and subtracted further down */
addiu t9, zero, 16069 // FIX_1_961570560
mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
addiu t9, zero, 3196 // FIX_0_390180644
mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
addiu t9, zero, 7373 // FIX_0_899976223
mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
addiu t9, zero, 20995 // FIX_2_562915447
mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
subu s0, s3, s0 // z3 += z5
addu t0, t0, s0 // tmp0 += z3
addu t1, t1, s0 // tmp2 += z3
subu s2, s3, s2 // z4 += z5
addu t2, t2, s2 // tmp1 += z4
addu t3, t3, s2 // tmp3 += z4
subu t0, t0, t8 // tmp0 += z1
subu t1, t1, s1 // tmp2 += z2
subu t2, t2, s1 // tmp1 += z2
subu t3, t3, t8 // tmp3 += z1
/* Even part: rows 0, 2, 4 and 6 */
mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
addiu t9, zero, 6270 // FIX_0_765366865
mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
lh t4, 0(a1)
lh t5, 0(a0)
lh t6, 64(a1)
lh t7, 64(a0)
mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
addiu t9, zero, 4433 // FIX_0_541196100
addu s3, s0, s1 // z2 + z3
mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
addiu t9, zero, 15137 // FIX_1_847759065
mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
addu t4, t5, t6
subu t5, t5, t6
sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
addu s0, t4, t7 // tmp10 = tmp0 + tmp3
subu s1, t4, t7 // tmp13 = tmp0 - tmp3
addu s2, t5, t6 // tmp11 = tmp1 + tmp2
subu s3, t5, t6 // tmp12 = tmp1 - tmp2
/* Final butterflies: combine the even part with the odd part */
addu t4, s0, t3 // tmp10 + tmp3 -> workspace row 0
subu s0, s0, t3 // tmp10 - tmp3 -> workspace row 7
addu t3, s2, t1 // tmp11 + tmp2 -> workspace row 1
subu s2, s2, t1 // tmp11 - tmp2 -> workspace row 6
addu t1, s3, t2 // tmp12 + tmp1 -> workspace row 2
subu s3, s3, t2 // tmp12 - tmp1 -> workspace row 5
addu t2, s1, t0 // tmp13 + tmp0 -> workspace row 3
subu s1, s1, t0 // tmp13 - tmp0 -> workspace row 4
/* Descale with rounding; 11 is presumably CONST_BITS - PASS1_BITS (13 - 2) */
shra_r.w t4, t4, 11
shra_r.w t3, t3, 11
shra_r.w t1, t1, 11
shra_r.w t2, t2, 11
shra_r.w s1, s1, 11
shra_r.w s3, s3, 11
shra_r.w s2, s2, 11
shra_r.w s0, s0, 11
sw t4, 0(v0)
sw t3, 32(v0)
sw t1, 64(v0)
sw t2, 96(v0)
sw s1, 128(v0)
sw s3, 160(v0)
sw s2, 192(v0)
sw s0, 224(v0)
3:
addiu a1, a1, 2 // next column of the multiplier table
addiu a0, a0, 2 // next column of the coefficient block
bgtz v1, 1b
addiu v0, v0, 4 // (branch delay slot) next workspace column
/*
* Pass 2: process rows from the work array, store into the output rows.
* One workspace row (8 words = 32 bytes) per iteration.
*/
move v0, sp // rewind to the start of the workspace
addiu v1, zero, 8 // eight rows to process
4:
lw t0, 8(v0) // z2 = (INT32) wsptr[2]
lw t1, 24(v0) // z3 = (INT32) wsptr[6]
lw t2, 0(v0) // (INT32) wsptr[0]
lw t3, 16(v0) // (INT32) wsptr[4]
lw s4, 4(v0) // (INT32) wsptr[1]
lw s5, 12(v0) // (INT32) wsptr[3]
lw s6, 20(v0) // (INT32) wsptr[5]
lw s7, 28(v0) // (INT32) wsptr[7]
or s4, s4, t0 // s4 = OR of wsptr[1..7]
or s4, s4, t1
or s4, s4, t3
or s4, s4, s7
or s4, s4, s5
or s4, s4, s6
bnez s4, 5f // some AC term is non-zero: do the full row
addiu v1, v1, -1 // (branch delay slot) one row fewer to go
/* AC terms all zero: all eight output samples equal the limited DC value */
shra_r.w s5, t2, 5 // rounding descale of wsptr[0] by 5 bits
andi s5, s5, 0x3ff // keep a 10-bit table index
lbux s5, s5(a3) // dcval = range_limit[index]
lw s1, 0(a2) // s1 = address of this output row
replv.qb s5, s5 // replicate dcval into all four bytes of the word
usw s5, 0(s1) // unaligned stores: output bytes 0..3
usw s5, 4(s1) // and output bytes 4..7
b 6f
nop
5:
/* Even part */
addu t4, t0, t1 // z2 + z3
addiu t8, zero, 4433 // FIX_0_541196100
mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
addiu t8, zero, 15137 // FIX_1_847759065
mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
addiu t8, zero, 6270 // FIX_0_765366865
mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
addu t4, t2, t3 // (INT32) wsptr[0] + (INT32) wsptr[4]
subu t2, t2, t3 // (INT32) wsptr[0] - (INT32) wsptr[4]
sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
subu t3, t2, t1 // tmp12 = tmp1 - tmp2
addu t2, t2, t1 // tmp11 = tmp1 + tmp2
addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
subu t1, t4, t5 // tmp13 = tmp0 - tmp3
addu t0, t4, t5 // tmp10 = tmp0 + tmp3
/* Odd part: the odd-indexed workspace words are reloaded from memory */
lw t4, 28(v0) // tmp0 = (INT32) wsptr[7]
lw t6, 12(v0) // tmp2 = (INT32) wsptr[3]
lw t5, 20(v0) // tmp1 = (INT32) wsptr[5]
lw t7, 4(v0) // tmp3 = (INT32) wsptr[1]
addu s0, t4, t6 // z3 = tmp0 + tmp2
addiu t8, zero, 9633 // FIX_1_175875602
addu s1, t5, t7 // z4 = tmp1 + tmp3
addu s2, s0, s1 // z3 + z4
mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
addu s3, t4, t7 // z1 = tmp0 + tmp3
addu t9, t5, t6 // z2 = tmp1 + tmp2
addiu t8, zero, 16069 // FIX_1_961570560
mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
addiu t8, zero, 3196 // FIX_0_390180644
mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
addiu t8, zero, 2446 // FIX_0_298631336
mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
addiu t8, zero, 7373 // FIX_0_899976223
mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
addiu t8, zero, 16819 // FIX_2_053119869
mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
addiu t8, zero, 20995 // FIX_2_562915447
mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
addiu t8, zero, 25172 // FIX_3_072711026
mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
addiu t8, zero, 12299 // FIX_1_501321110
mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
subu s0, s2, s0 // z3 += z5
subu s1, s2, s1 // z4 += z5
addu t4, t4, s0
subu t4, t4, s3 // tmp0
addu t5, t5, s1
subu t5, t5, t9 // tmp1
addu t6, t6, s0
subu t6, t6, t9 // tmp2
addu t7, t7, s1
subu t7, t7, s3 // tmp3
/* Final butterflies: combine the even part with the odd part */
addu s0, t0, t7 // tmp10 + tmp3 -> output byte 0
subu t0, t0, t7 // tmp10 - tmp3 -> output byte 7
addu t7, t2, t6 // tmp11 + tmp2 -> output byte 1
subu t2, t2, t6 // tmp11 - tmp2 -> output byte 6
addu t6, t3, t5 // tmp12 + tmp1 -> output byte 2
subu t3, t3, t5 // tmp12 - tmp1 -> output byte 5
addu t5, t1, t4 // tmp13 + tmp0 -> output byte 3
subu t1, t1, t4 // tmp13 - tmp0 -> output byte 4
/* Descale with rounding; 18 is presumably CONST_BITS + PASS1_BITS + 3 */
shra_r.w s0, s0, 18
shra_r.w t7, t7, 18
shra_r.w t6, t6, 18
shra_r.w t5, t5, 18
shra_r.w t1, t1, 18
shra_r.w t3, t3, 18
shra_r.w t2, t2, 18
shra_r.w t0, t0, 18
/* Keep 10 bits of each result so it can index the range_limit table */
andi s0, s0, 0x3ff
andi t7, t7, 0x3ff
andi t6, t6, 0x3ff
andi t5, t5, 0x3ff
andi t1, t1, 0x3ff
andi t3, t3, 0x3ff
andi t2, t2, 0x3ff
andi t0, t0, 0x3ff
lw s1, 0(a2) // s1 = address of this output row
lbux s0, s0(a3) // range_limit[] lookups, one byte per sample
lbux t7, t7(a3)
lbux t6, t6(a3)
lbux t5, t5(a3)
lbux t1, t1(a3)
lbux t3, t3(a3)
lbux t2, t2(a3)
lbux t0, t0(a3)
sb s0, 0(s1)
sb t7, 1(s1)
sb t6, 2(s1)
sb t5, 3(s1)
sb t1, 4(s1)
sb t3, 5(s1)
sb t2, 6(s1)
sb t0, 7(s1)
6:
addiu v0, v0, 32 // next workspace row
bgtz v1, 4b
addiu a2, a2, 4 // (branch delay slot) next output row address
addiu sp, sp, 256 // release the workspace
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_idct_islow_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
/*