SIMD-accelerated fast integer inverse DCT routine for MIPS DSPr2

This commit is contained in:
DRC
2013-10-08 02:18:59 +00:00
parent 6addfed58b
commit 10138c9d35
3 changed files with 349 additions and 2 deletions

View File

@@ -789,6 +789,15 @@ EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
JSAMPARRAY output_buf,
JDIMENSION output_col));
EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 JPP((JCOEF * inptr,
IFAST_MULT_TYPE * quantptr,
DCTELEM * wsptr,
const int * idct_coefs));
EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2 JPP((DCTELEM * wsptr,
JSAMPARRAY output_buf,
JDIMENSION output_col,
const int * idct_coefs));
EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
JCOEFPTR coef_block,
JSAMPARRAY output_buf,

View File

@@ -78,6 +78,12 @@ init_simd (void)
return;
#endif
}
static const int mips_idct_ifast_coefs[4] = {
0x45404540, // FIX( 1.082392200 / 2) = 17734 = 0x4546
0x5A805A80, // FIX( 1.414213562 / 2) = 23170 = 0x5A82
0x76407640, // FIX( 1.847759065 / 2) = 30274 = 0x7642
0xAC60AC60 // FIX(-2.613125930 / 4) = -21407 = 0xAC61
};
GLOBAL(int)
jsimd_can_rgb_ycc (void)
@@ -726,6 +732,25 @@ jsimd_can_idct_islow (void)
GLOBAL(int)
jsimd_can_idct_ifast (void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(IFAST_MULT_TYPE) != 2)
return 0;
if (IFAST_SCALE_BITS != 2)
return 0;
if ((simd_support & JSIMD_MIPS_DSPR2))
return 1;
return 0;
}
@@ -744,9 +769,29 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
GLOBAL(void)
jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
if (simd_support & JSIMD_MIPS_DSPR2) {
JCOEFPTR inptr;
IFAST_MULT_TYPE * quantptr;
DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
workspace, mips_idct_ifast_coefs);
/* Pass 2: process rows from work array, store into output array. */
/* Note that we must descale the results by a factor of 8 == 2**3, */
/* and also undo the PASS1_BITS scaling. */
jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
output_col, mips_idct_ifast_coefs);
}
}
GLOBAL(void)

View File

@@ -963,6 +963,299 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
j ra
nop
END(jsimd_h2v2_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
/*
* a0 - inptr
* a1 - quantptr
* a2 - wsptr
* a3 - mips_idct_ifast_coefs
*/
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
addiu t9, a0, 16 // end address
or AT, a3, zero
0:
lw s0, 0(a1) // quantptr[DCTSIZE*0]
lw t0, 0(a0) // inptr[DCTSIZE*0]
lw t1, 16(a0) // inptr[DCTSIZE*1]
muleq_s.w.phl v0, t0, s0 // tmp0 ...
lw t2, 32(a0) // inptr[DCTSIZE*2]
lw t3, 48(a0) // inptr[DCTSIZE*3]
lw t4, 64(a0) // inptr[DCTSIZE*4]
lw t5, 80(a0) // inptr[DCTSIZE*5]
muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
lw t6, 96(a0) // inptr[DCTSIZE*6]
lw t7, 112(a0) // inptr[DCTSIZE*7]
or s4, t1, t2
or s5, t3, t4
bnez s4, 1f
ins t0, v0, 16, 16 // ... tmp0
bnez s5, 1f
or s6, t5, t6
or s6, s6, t7
bnez s6, 1f
sw t0, 0(a2) // wsptr[DCTSIZE*0]
sw t0, 16(a2) // wsptr[DCTSIZE*1]
sw t0, 32(a2) // wsptr[DCTSIZE*2]
sw t0, 48(a2) // wsptr[DCTSIZE*3]
sw t0, 64(a2) // wsptr[DCTSIZE*4]
sw t0, 80(a2) // wsptr[DCTSIZE*5]
sw t0, 96(a2) // wsptr[DCTSIZE*6]
sw t0, 112(a2) // wsptr[DCTSIZE*7]
addiu a0, a0, 4
b 2f
addiu a1, a1, 4
1:
lw s1, 32(a1) // quantptr[DCTSIZE*2]
lw s2, 64(a1) // quantptr[DCTSIZE*4]
muleq_s.w.phl v0, t2, s1 // tmp1 ...
muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
lw s0, 16(a1) // quantptr[DCTSIZE*1]
lw s1, 48(a1) // quantptr[DCTSIZE*3]
lw s3, 96(a1) // quantptr[DCTSIZE*6]
muleq_s.w.phl v1, t4, s2 // tmp2 ...
muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
lw s2, 80(a1) // quantptr[DCTSIZE*5]
lw t8, 4(AT) // FIX(1.414213562)
ins t2, v0, 16, 16 // ... tmp1
muleq_s.w.phl v0, t6, s3 // tmp3 ...
muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
ins t4, v1, 16, 16 // ... tmp2
addq.ph s4, t0, t4 // tmp10
subq.ph s5, t0, t4 // tmp11
ins t6, v0, 16, 16 // ... tmp3
subq.ph s6, t2, t6 // tmp12 ...
addq.ph s7, t2, t6 // tmp13
mulq_s.ph s6, s6, t8 // ... tmp12 ...
addq.ph t0, s4, s7 // tmp0
subq.ph t6, s4, s7 // tmp3
muleq_s.w.phl v0, t1, s0 // tmp4 ...
muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
shll_s.ph s6, s6, 1 // x2
lw s3, 112(a1) // quantptr[DCTSIZE*7]
subq.ph s6, s6, s7 // ... tmp12
muleq_s.w.phl v1, t7, s3 // tmp7 ...
muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
ins t1, v0, 16, 16 // ... tmp4
addq.ph t2, s5, s6 // tmp1
subq.ph t4, s5, s6 // tmp2
muleq_s.w.phl v0, t5, s2 // tmp6 ...
muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
ins t7, v1, 16, 16 // ... tmp7
addq.ph s5, t1, t7 // z11
subq.ph s6, t1, t7 // z12
muleq_s.w.phl v1, t3, s1 // tmp5 ...
muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
ins t5, v0, 16, 16 // ... tmp6
ins t3, v1, 16, 16 // ... tmp5
addq.ph s7, t5, t3 // z13
subq.ph v0, t5, t3 // z10
addq.ph t7, s5, s7 // tmp7
subq.ph s5, s5, s7 // tmp11 ...
addq.ph v1, v0, s6 // z5 ...
mulq_s.ph s5, s5, t8 // ... tmp11
lw t8, 8(AT) // FIX(1.847759065)
lw s4, 0(AT) // FIX(1.082392200)
addq.ph s0, t0, t7
subq.ph s1, t0, t7
mulq_s.ph v1, v1, t8 // ... z5
shll_s.ph s5, s5, 1 // x2
lw t8, 12(AT) // FIX(-2.613125930)
sw s0, 0(a2) // wsptr[DCTSIZE*0]
shll_s.ph v0, v0, 1 // x4
mulq_s.ph v0, v0, t8 // tmp12 ...
mulq_s.ph s4, s6, s4 // tmp10 ...
shll_s.ph v1, v1, 1 // x2
addiu a0, a0, 4
addiu a1, a1, 4
sw s1, 112(a2) // wsptr[DCTSIZE*7]
shll_s.ph s6, v0, 1 // x4
shll_s.ph s4, s4, 1 // x2
addq.ph s6, s6, v1 // ... tmp12
subq.ph t5, s6, t7 // tmp6
subq.ph s4, s4, v1 // ... tmp10
subq.ph t3, s5, t5 // tmp5
addq.ph s2, t2, t5
addq.ph t1, s4, t3 // tmp4
subq.ph s3, t2, t5
sw s2, 16(a2) // wsptr[DCTSIZE*1]
sw s3, 96(a2) // wsptr[DCTSIZE*6]
addq.ph v0, t4, t3
subq.ph v1, t4, t3
sw v0, 32(a2) // wsptr[DCTSIZE*2]
sw v1, 80(a2) // wsptr[DCTSIZE*5]
addq.ph v0, t6, t1
subq.ph v1, t6, t1
sw v0, 64(a2) // wsptr[DCTSIZE*4]
sw v1, 48(a2) // wsptr[DCTSIZE*3]
2:
bne a0, t9, 0b
addiu a2, a2, 4
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_idct_ifast_cols_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
/*
* a0 - wsptr
* a1 - output_buf
* a2 - output_col
* a3 - mips_idct_ifast_coefs
*/
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
addiu t9, a0, 128 // end address
lui s8, 0x8080
ori s8, s8, 0x8080
0:
lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
precrq.ph.w t1, s0, t0 // B b
ins t0, s0, 16, 16 // A a
bnez t1, 1f
or s0, t2, s2
bnez s0, 1f
or s0, t4, s4
bnez s0, 1f
or s0, t6, s6
bnez s0, 1f
shll_s.ph s0, t0, 2 // A a
lw a3, 0(a1)
lw AT, 4(a1)
precrq.ph.w t0, s0, s0 // A A
ins s0, s0, 16, 16 // a a
addu a3, a3, a2
addu AT, AT, a2
precrq.qb.ph t0, t0, t0 // A A A A
precrq.qb.ph s0, s0, s0 // a a a a
addu.qb s0, s0, s8
addu.qb t0, t0, s8
sw s0, 0(a3)
sw s0, 4(a3)
sw t0, 0(AT)
sw t0, 4(AT)
addiu a0, a0, 32
bne a0, t9, 0b
addiu a1, a1, 8
b 2f
nop
1:
precrq.ph.w t3, s2, t2
ins t2, s2, 16, 16
precrq.ph.w t5, s4, t4
ins t4, s4, 16, 16
precrq.ph.w t7, s6, t6
ins t6, s6, 16, 16
lw t8, 4(AT) // FIX(1.414213562)
addq.ph s4, t0, t4 // tmp10
subq.ph s5, t0, t4 // tmp11
subq.ph s6, t2, t6 // tmp12 ...
addq.ph s7, t2, t6 // tmp13
mulq_s.ph s6, s6, t8 // ... tmp12 ...
addq.ph t0, s4, s7 // tmp0
subq.ph t6, s4, s7 // tmp3
shll_s.ph s6, s6, 1 // x2
subq.ph s6, s6, s7 // ... tmp12
addq.ph t2, s5, s6 // tmp1
subq.ph t4, s5, s6 // tmp2
addq.ph s5, t1, t7 // z11
subq.ph s6, t1, t7 // z12
addq.ph s7, t5, t3 // z13
subq.ph v0, t5, t3 // z10
addq.ph t7, s5, s7 // tmp7
subq.ph s5, s5, s7 // tmp11 ...
addq.ph v1, v0, s6 // z5 ...
mulq_s.ph s5, s5, t8 // ... tmp11
lw t8, 8(AT) // FIX(1.847759065)
lw s4, 0(AT) // FIX(1.082392200)
addq.ph s0, t0, t7 // tmp0 + tmp7
subq.ph s7, t0, t7 // tmp0 - tmp7
mulq_s.ph v1, v1, t8 // ... z5
lw a3, 0(a1)
lw t8, 12(AT) // FIX(-2.613125930)
shll_s.ph s5, s5, 1 // x2
addu a3, a3, a2
shll_s.ph v0, v0, 1 // x4
mulq_s.ph v0, v0, t8 // tmp12 ...
mulq_s.ph s4, s6, s4 // tmp10 ...
shll_s.ph v1, v1, 1 // x2
addiu a0, a0, 32
addiu a1, a1, 8
shll_s.ph s6, v0, 1 // x4
shll_s.ph s4, s4, 1 // x2
addq.ph s6, s6, v1 // ... tmp12
shll_s.ph s0, s0, 2
subq.ph t5, s6, t7 // tmp6
subq.ph s4, s4, v1 // ... tmp10
subq.ph t3, s5, t5 // tmp5
shll_s.ph s7, s7, 2
addq.ph t1, s4, t3 // tmp4
addq.ph s1, t2, t5 // tmp1 + tmp6
subq.ph s6, t2, t5 // tmp1 - tmp6
addq.ph s2, t4, t3 // tmp2 + tmp5
subq.ph s5, t4, t3 // tmp2 - tmp5
addq.ph s4, t6, t1 // tmp3 + tmp4
subq.ph s3, t6, t1 // tmp3 - tmp4
shll_s.ph s1, s1, 2
shll_s.ph s2, s2, 2
shll_s.ph s3, s3, 2
shll_s.ph s4, s4, 2
shll_s.ph s5, s5, 2
shll_s.ph s6, s6, 2
precrq.ph.w t0, s1, s0 // B A
ins s0, s1, 16, 16 // b a
precrq.ph.w t2, s3, s2 // D C
ins s2, s3, 16, 16 // d c
precrq.ph.w t4, s5, s4 // F E
ins s4, s5, 16, 16 // f e
precrq.ph.w t6, s7, s6 // H G
ins s6, s7, 16, 16 // h g
precrq.qb.ph t0, t2, t0 // D C B A
precrq.qb.ph s0, s2, s0 // d c b a
precrq.qb.ph t4, t6, t4 // H G F E
precrq.qb.ph s4, s6, s4 // h g f e
addu.qb s0, s0, s8
addu.qb s4, s4, s8
sw s0, 0(a3) // outptr[0/1/2/3] d c b a
sw s4, 4(a3) // outptr[4/5/6/7] h g f e
lw a3, -4(a1)
addu.qb t0, t0, s8
addu a3, a3, a2
addu.qb t4, t4, s8
sw t0, 0(a3) // outptr[0/1/2/3] D C B A
bne a0, t9, 0b
sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2:
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
j ra
nop
END(jsimd_idct_ifast_rows_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
/*