SIMD-accelerated fast integer inverse DCT routine for MIPS DSPr2
This commit is contained in:
@@ -789,6 +789,15 @@ EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 JPP((JCOEF * inptr,
|
||||
IFAST_MULT_TYPE * quantptr,
|
||||
DCTELEM * wsptr,
|
||||
const int * idct_coefs));
|
||||
EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2 JPP((DCTELEM * wsptr,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col,
|
||||
const int * idct_coefs));
|
||||
|
||||
EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
|
||||
@@ -78,6 +78,12 @@ init_simd (void)
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
/*
 * Multiplier constants handed to the MIPS DSPr2 fast integer IDCT routines.
 *
 * Every 32-bit word holds the same signed 16-bit fixed-point constant in
 * both halfwords, so that one paired-halfword multiply applies it to two
 * values at once.  The constants are the listed factor scaled by 2^15
 * (the factor is pre-divided by 2 or 4 so that it fits in 16 bits; the
 * assembly compensates with left shifts after the multiply).
 *
 * The stored halfwords have their low five or more bits cleared, so they
 * differ slightly from the exactly rounded values quoted on the right.
 * NOTE(review): presumably this keeps the results in line with the lower
 * precision constants of the C implementation -- confirm before "fixing"
 * the values.
 *
 * The order matters: the assembly addresses the entries by byte offset
 * (0, 4, 8, 12).
 */
static const int mips_idct_ifast_coefs[4] = {
  0x45404540,        /* 0x4540 =  17728; exact FIX( 1.082392200 / 2) =  17734 = 0x4546 */
  0x5A805A80,        /* 0x5A80 =  23168; exact FIX( 1.414213562 / 2) =  23170 = 0x5A82 */
  0x76407640,        /* 0x7640 =  30272; exact FIX( 1.847759065 / 2) =  30274 = 0x7642 */
  /* The literal below has type unsigned int (it exceeds INT_MAX); the cast
     makes the intended reinterpretation as a negative bit pattern explicit. */
  (int) 0xAC60AC60   /* 0xAC60 = -21408; exact FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
};
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_ycc (void)
|
||||
@@ -726,6 +732,25 @@ jsimd_can_idct_islow (void)
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_ifast (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(IFAST_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
if (IFAST_SCALE_BITS != 2)
|
||||
return 0;
|
||||
|
||||
if ((simd_support & JSIMD_MIPS_DSPR2))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -744,9 +769,29 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
if (simd_support & JSIMD_MIPS_DSPR2) {
|
||||
JCOEFPTR inptr;
|
||||
IFAST_MULT_TYPE * quantptr;
|
||||
DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
|
||||
|
||||
/* Pass 1: process columns from input, store into work array. */
|
||||
|
||||
inptr = coef_block;
|
||||
quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
|
||||
|
||||
jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
|
||||
workspace, mips_idct_ifast_coefs);
|
||||
|
||||
/* Pass 2: process rows from work array, store into output array. */
|
||||
/* Note that we must descale the results by a factor of 8 == 2**3, */
|
||||
/* and also undo the PASS1_BITS scaling. */
|
||||
|
||||
jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
|
||||
output_col, mips_idct_ifast_coefs);
|
||||
}
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
||||
@@ -963,6 +963,299 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
|
||||
j ra
|
||||
nop
|
||||
END(jsimd_h2v2_upsample_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
/*
 * Pass 1 of the fast integer IDCT: process columns of the coefficient
 * block, multiply by the quantptr table, store the results into wsptr.
 *
 * a0     - inptr
 * a1     - quantptr
 * a2     - wsptr
 * a3     - mips_idct_ifast_coefs
 *
 * Each lw fetches two adjacent 16-bit elements, and the paired-halfword
 * (.ph) instructions work on both, so one loop iteration handles two
 * columns.  a0/a1/a2 advance by 4 bytes per iteration and the loop ends
 * when a0 reaches inptr + 16 bytes.
 *
 * The instruction that follows every branch/jump is written as its delay
 * slot (shown with one extra space of indentation) and is relied upon to
 * execute; the code must therefore be assembled without instruction
 * reordering.
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu             t9, a0, 16            // end address
    or                AT, a3, zero          // AT = mips_idct_ifast_coefs

0:
    lw                s0, 0(a1)             // quantptr[DCTSIZE*0]
    lw                t0, 0(a0)             // inptr[DCTSIZE*0]
    lw                t1, 16(a0)            // inptr[DCTSIZE*1]
    muleq_s.w.phl     v0, t0, s0            // tmp0 ...
    lw                t2, 32(a0)            // inptr[DCTSIZE*2]
    lw                t3, 48(a0)            // inptr[DCTSIZE*3]
    lw                t4, 64(a0)            // inptr[DCTSIZE*4]
    lw                t5, 80(a0)            // inptr[DCTSIZE*5]
    muleq_s.w.phr     t0, t0, s0            // ... tmp0 ...
    lw                t6, 96(a0)            // inptr[DCTSIZE*6]
    lw                t7, 112(a0)           // inptr[DCTSIZE*7]
    // Take the full path at 1: if any of inptr[DCTSIZE*1..7] is nonzero
    // in either of the two columns.
    or                s4, t1, t2
    or                s5, t3, t4
    bnez              s4, 1f
     ins              t0, v0, 16, 16        // ... tmp0
    bnez              s5, 1f
     or               s6, t5, t6
    or                s6, s6, t7
    bnez              s6, 1f
     sw               t0, 0(a2)             // wsptr[DCTSIZE*0]
    // Only the DCTSIZE*0 term is nonzero: replicate tmp0 down the columns.
    // (The store in the delay slot above also runs when the branch is
    // taken; the full path overwrites wsptr[DCTSIZE*0] afterwards.)
    sw                t0, 16(a2)            // wsptr[DCTSIZE*1]
    sw                t0, 32(a2)            // wsptr[DCTSIZE*2]
    sw                t0, 48(a2)            // wsptr[DCTSIZE*3]
    sw                t0, 64(a2)            // wsptr[DCTSIZE*4]
    sw                t0, 80(a2)            // wsptr[DCTSIZE*5]
    sw                t0, 96(a2)            // wsptr[DCTSIZE*6]
    sw                t0, 112(a2)           // wsptr[DCTSIZE*7]
    addiu             a0, a0, 4
    b                 2f
     addiu            a1, a1, 4

1:
    lw                s1, 32(a1)            // quantptr[DCTSIZE*2]
    lw                s2, 64(a1)            // quantptr[DCTSIZE*4]
    muleq_s.w.phl     v0, t2, s1            // tmp1 ...
    muleq_s.w.phr     t2, t2, s1            // ... tmp1 ...
    lw                s0, 16(a1)            // quantptr[DCTSIZE*1]
    lw                s1, 48(a1)            // quantptr[DCTSIZE*3]
    lw                s3, 96(a1)            // quantptr[DCTSIZE*6]
    muleq_s.w.phl     v1, t4, s2            // tmp2 ...
    muleq_s.w.phr     t4, t4, s2            // ... tmp2 ...
    lw                s2, 80(a1)            // quantptr[DCTSIZE*5]
    lw                t8, 4(AT)             // FIX(1.414213562)
    ins               t2, v0, 16, 16        // ... tmp1
    muleq_s.w.phl     v0, t6, s3            // tmp3 ...
    muleq_s.w.phr     t6, t6, s3            // ... tmp3 ...
    ins               t4, v1, 16, 16        // ... tmp2
    addq.ph           s4, t0, t4            // tmp10
    subq.ph           s5, t0, t4            // tmp11
    ins               t6, v0, 16, 16        // ... tmp3
    subq.ph           s6, t2, t6            // tmp12 ...
    addq.ph           s7, t2, t6            // tmp13
    mulq_s.ph         s6, s6, t8            // ... tmp12 ...
    addq.ph           t0, s4, s7            // tmp0
    subq.ph           t6, s4, s7            // tmp3
    muleq_s.w.phl     v0, t1, s0            // tmp4 ...
    muleq_s.w.phr     t1, t1, s0            // ... tmp4 ...
    shll_s.ph         s6, s6, 1             // x2
    lw                s3, 112(a1)           // quantptr[DCTSIZE*7]
    subq.ph           s6, s6, s7            // ... tmp12
    muleq_s.w.phl     v1, t7, s3            // tmp7 ...
    muleq_s.w.phr     t7, t7, s3            // ... tmp7 ...
    ins               t1, v0, 16, 16        // ... tmp4
    addq.ph           t2, s5, s6            // tmp1
    subq.ph           t4, s5, s6            // tmp2
    muleq_s.w.phl     v0, t5, s2            // tmp6 ...
    muleq_s.w.phr     t5, t5, s2            // ... tmp6 ...
    ins               t7, v1, 16, 16        // ... tmp7
    addq.ph           s5, t1, t7            // z11
    subq.ph           s6, t1, t7            // z12
    muleq_s.w.phl     v1, t3, s1            // tmp5 ...
    muleq_s.w.phr     t3, t3, s1            // ... tmp5 ...
    ins               t5, v0, 16, 16        // ... tmp6
    ins               t3, v1, 16, 16        // ... tmp5
    addq.ph           s7, t5, t3            // z13
    subq.ph           v0, t5, t3            // z10
    addq.ph           t7, s5, s7            // tmp7
    subq.ph           s5, s5, s7            // tmp11 ...
    addq.ph           v1, v0, s6            // z5 ...
    mulq_s.ph         s5, s5, t8            // ... tmp11
    lw                t8, 8(AT)             // FIX(1.847759065)
    lw                s4, 0(AT)             // FIX(1.082392200)
    addq.ph           s0, t0, t7            // tmp0 + tmp7
    subq.ph           s1, t0, t7            // tmp0 - tmp7
    mulq_s.ph         v1, v1, t8            // ... z5
    shll_s.ph         s5, s5, 1             // x2
    lw                t8, 12(AT)            // FIX(-2.613125930)
    sw                s0, 0(a2)             // wsptr[DCTSIZE*0]
    shll_s.ph         v0, v0, 1             // x4
    mulq_s.ph         v0, v0, t8            // tmp12 ...
    mulq_s.ph         s4, s6, s4            // tmp10 ...
    shll_s.ph         v1, v1, 1             // x2
    addiu             a0, a0, 4
    addiu             a1, a1, 4
    sw                s1, 112(a2)           // wsptr[DCTSIZE*7]
    shll_s.ph         s6, v0, 1             // x4
    shll_s.ph         s4, s4, 1             // x2
    addq.ph           s6, s6, v1            // ... tmp12
    subq.ph           t5, s6, t7            // tmp6
    subq.ph           s4, s4, v1            // ... tmp10
    subq.ph           t3, s5, t5            // tmp5
    addq.ph           s2, t2, t5            // tmp1 + tmp6
    addq.ph           t1, s4, t3            // tmp4
    subq.ph           s3, t2, t5            // tmp1 - tmp6
    sw                s2, 16(a2)            // wsptr[DCTSIZE*1]
    sw                s3, 96(a2)            // wsptr[DCTSIZE*6]
    addq.ph           v0, t4, t3            // tmp2 + tmp5
    subq.ph           v1, t4, t3            // tmp2 - tmp5
    sw                v0, 32(a2)            // wsptr[DCTSIZE*2]
    sw                v1, 80(a2)            // wsptr[DCTSIZE*5]
    addq.ph           v0, t6, t1            // tmp3 + tmp4
    subq.ph           v1, t6, t1            // tmp3 - tmp4
    sw                v0, 64(a2)            // wsptr[DCTSIZE*4]
    sw                v1, 48(a2)            // wsptr[DCTSIZE*3]

2:
    bne               a0, t9, 0b
     addiu            a2, a2, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j                 ra
     nop

END(jsimd_idct_ifast_cols_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
/*
 * Pass 2 of the fast integer IDCT: process rows of the work array and
 * store the resulting samples into the output rows.
 *
 * a0     - wsptr
 * a1     - output_buf
 * a2     - output_col
 * a3     - mips_idct_ifast_coefs
 *
 * One loop iteration handles two workspace rows (a0 advances by 32 bytes,
 * a1 by two row pointers); the loop ends when a0 reaches wsptr + 128 bytes.
 * In the lane comments, lower-case letters (a..h) are the eight elements of
 * the first row of the pair and upper-case letters (A..H) those of the
 * second row.
 *
 * The lane handling takes element 0 of each loaded pair from the low
 * halfword and stores sample 0 in the low byte of each word, i.e. it
 * assumes little-endian memory layout.
 *
 * a3 is saved on the stack and re-read into AT at the top of every
 * iteration, because both a3 and AT are reused as output row pointers.
 * s8 holds 0x80808080, which is added to every output byte.
 *
 * The instruction that follows every branch/jump is written as its delay
 * slot (shown with one extra space of indentation) and is relied upon to
 * execute; the code must therefore be assembled without instruction
 * reordering.
 */

    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    addiu             t9, a0, 128           // end address
    lui               s8, 0x8080
    ori               s8, s8, 0x8080

0:
    lw                AT, 36(sp)            // restore $a3 (mips_idct_ifast_coefs)
    lw                t0, 0(a0)             // wsptr[DCTSIZE*0+0/1]  b a
    lw                s0, 16(a0)            // wsptr[DCTSIZE*1+0/1]  B A
    lw                t2, 4(a0)             // wsptr[DCTSIZE*0+2/3]  d c
    lw                s2, 20(a0)            // wsptr[DCTSIZE*1+2/3]  D C
    lw                t4, 8(a0)             // wsptr[DCTSIZE*0+4/5]  f e
    lw                s4, 24(a0)            // wsptr[DCTSIZE*1+4/5]  F E
    lw                t6, 12(a0)            // wsptr[DCTSIZE*0+6/7]  h g
    lw                s6, 28(a0)            // wsptr[DCTSIZE*1+6/7]  H G
    precrq.ph.w       t1, s0, t0            // B b
    ins               t0, s0, 16, 16        // A a
    // Take the full path at 1: if any element other than a/A is nonzero.
    bnez              t1, 1f
     or               s0, t2, s2
    bnez              s0, 1f
     or               s0, t4, s4
    bnez              s0, 1f
     or               s0, t6, s6
    bnez              s0, 1f
     shll_s.ph        s0, t0, 2             // A a
    // Only a/A are nonzero: every sample of a row gets the same value.
    lw                a3, 0(a1)
    lw                AT, 4(a1)
    precrq.ph.w       t0, s0, s0            // A A
    ins               s0, s0, 16, 16        // a a
    addu              a3, a3, a2
    addu              AT, AT, a2
    precrq.qb.ph      t0, t0, t0            // A A A A
    precrq.qb.ph      s0, s0, s0            // a a a a
    addu.qb           s0, s0, s8
    addu.qb           t0, t0, s8
    sw                s0, 0(a3)
    sw                s0, 4(a3)
    sw                t0, 0(AT)
    sw                t0, 4(AT)
    addiu             a0, a0, 32
    bne               a0, t9, 0b
     addiu            a1, a1, 8
    b                 2f
     nop

1:
    precrq.ph.w       t3, s2, t2            // D d
    ins               t2, s2, 16, 16        // C c
    precrq.ph.w       t5, s4, t4            // F f
    ins               t4, s4, 16, 16        // E e
    precrq.ph.w       t7, s6, t6            // H h
    ins               t6, s6, 16, 16        // G g
    lw                t8, 4(AT)             // FIX(1.414213562)
    addq.ph           s4, t0, t4            // tmp10
    subq.ph           s5, t0, t4            // tmp11
    subq.ph           s6, t2, t6            // tmp12 ...
    addq.ph           s7, t2, t6            // tmp13
    mulq_s.ph         s6, s6, t8            // ... tmp12 ...
    addq.ph           t0, s4, s7            // tmp0
    subq.ph           t6, s4, s7            // tmp3
    shll_s.ph         s6, s6, 1             // x2
    subq.ph           s6, s6, s7            // ... tmp12
    addq.ph           t2, s5, s6            // tmp1
    subq.ph           t4, s5, s6            // tmp2
    addq.ph           s5, t1, t7            // z11
    subq.ph           s6, t1, t7            // z12
    addq.ph           s7, t5, t3            // z13
    subq.ph           v0, t5, t3            // z10
    addq.ph           t7, s5, s7            // tmp7
    subq.ph           s5, s5, s7            // tmp11 ...
    addq.ph           v1, v0, s6            // z5 ...
    mulq_s.ph         s5, s5, t8            // ... tmp11
    lw                t8, 8(AT)             // FIX(1.847759065)
    lw                s4, 0(AT)             // FIX(1.082392200)
    addq.ph           s0, t0, t7            // tmp0 + tmp7
    subq.ph           s7, t0, t7            // tmp0 - tmp7
    mulq_s.ph         v1, v1, t8            // ... z5
    lw                a3, 0(a1)
    lw                t8, 12(AT)            // FIX(-2.613125930)
    shll_s.ph         s5, s5, 1             // x2
    addu              a3, a3, a2
    shll_s.ph         v0, v0, 1             // x4
    mulq_s.ph         v0, v0, t8            // tmp12 ...
    mulq_s.ph         s4, s6, s4            // tmp10 ...
    shll_s.ph         v1, v1, 1             // x2
    addiu             a0, a0, 32
    addiu             a1, a1, 8
    shll_s.ph         s6, v0, 1             // x4
    shll_s.ph         s4, s4, 1             // x2
    addq.ph           s6, s6, v1            // ... tmp12
    shll_s.ph         s0, s0, 2
    subq.ph           t5, s6, t7            // tmp6
    subq.ph           s4, s4, v1            // ... tmp10
    subq.ph           t3, s5, t5            // tmp5
    shll_s.ph         s7, s7, 2
    addq.ph           t1, s4, t3            // tmp4
    addq.ph           s1, t2, t5            // tmp1 + tmp6
    subq.ph           s6, t2, t5            // tmp1 - tmp6
    addq.ph           s2, t4, t3            // tmp2 + tmp5
    subq.ph           s5, t4, t3            // tmp2 - tmp5
    addq.ph           s4, t6, t1            // tmp3 + tmp4
    subq.ph           s3, t6, t1            // tmp3 - tmp4
    shll_s.ph         s1, s1, 2
    shll_s.ph         s2, s2, 2
    shll_s.ph         s3, s3, 2
    shll_s.ph         s4, s4, 2
    shll_s.ph         s5, s5, 2
    shll_s.ph         s6, s6, 2
    precrq.ph.w       t0, s1, s0            // B A
    ins               s0, s1, 16, 16        // b a
    precrq.ph.w       t2, s3, s2            // D C
    ins               s2, s3, 16, 16        // d c
    precrq.ph.w       t4, s5, s4            // F E
    ins               s4, s5, 16, 16        // f e
    precrq.ph.w       t6, s7, s6            // H G
    ins               s6, s7, 16, 16        // h g
    precrq.qb.ph      t0, t2, t0            // D C B A
    precrq.qb.ph      s0, s2, s0            // d c b a
    precrq.qb.ph      t4, t6, t4            // H G F E
    precrq.qb.ph      s4, s6, s4            // h g f e
    addu.qb           s0, s0, s8
    addu.qb           s4, s4, s8
    sw                s0, 0(a3)             // outptr[0/1/2/3]       d c b a
    sw                s4, 4(a3)             // outptr[4/5/6/7]       h g f e
    lw                a3, -4(a1)            // second row pointer (a1 already advanced)
    addu.qb           t0, t0, s8
    addu              a3, a3, a2
    addu.qb           t4, t4, s8
    sw                t0, 0(a3)             // outptr[0/1/2/3]       D C B A
    bne               a0, t9, 0b
     sw               t4, 4(a3)             // outptr[4/5/6/7]       H G F E

2:

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    j                 ra
     nop

END(jsimd_idct_ifast_rows_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user