SIMD support for performing fancy upsampling using MIPS DSPr2 instructions
This commit is contained in:
@@ -586,6 +586,13 @@ EXTERN(void) jsimd_h2v1_fancy_upsample_neon
|
||||
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
|
||||
/* MIPS DSPr2 entry point for h2v1 fancy upsampling. */
EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
|
||||
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
/* MIPS DSPr2 entry point for h2v2 fancy upsampling. */
EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
|
||||
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
|
||||
/* SIMD Sample Conversion */
|
||||
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
|
||||
JDIMENSION start_col,
|
||||
|
||||
@@ -265,12 +265,32 @@ jsimd_h2v1_upsample (j_decompress_ptr cinfo,
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -280,6 +300,9 @@ jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -288,6 +311,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
|
||||
@@ -248,3 +248,242 @@ GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
|
||||
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
|
||||
|
||||
/*****************************************************************************/
|
||||
/*
|
||||
* jsimd_h2v2_fancy_upsample_mips_dspr2
|
||||
*
|
||||
* Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
*/
|
||||
LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * Each input row produces two output rows: the first pass (t9 == 2) pairs
 * the row with the one before it (-4(a2)), the second pass with the one
 * after it (4(a2)).  Per row the code emits the first column, then pairs of
 * columns (loop 2), then an odd residual column (loop 3), then the last
 * column (label 4).
 *
 * Instructions indented by one extra space sit in branch delay slots.
 */

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li            s4, 0              // s4 = byte offset of current output row
    lw            s2, 0(a3)          // s2 = *output_data_ptr
0:
    li            t9, 2              // two output rows per input row
    lw            s1, -4(a2)         // s1 = inptr1

1:
    lw            s0, 0(a2)          // s0 = inptr0
    lwx           s3, s4(s2)         // s3 = outptr for this output row
    addiu         s5, a1, -2         // s5 = downsampled_width - 2
    srl           t4, s5, 1
    sll           t4, t4, 1          // t4 = (downsampled_width - 2) & ~1
    lbu           t0, 0(s0)
    lbu           t1, 1(s0)
    lbu           t2, 0(s1)
    lbu           t3, 1(s1)
    addiu         s0, 2
    addiu         s1, 2
    addu          t8, s0, t4         // t8 = end address
    andi          s5, s5, 1          // s5 = residual
    sll           t4, t0, 1
    sll           t6, t1, 1
    addu          t0, t0, t4         // t0 = (*inptr0++) * 3
    addu          t1, t1, t6         // t1 = (*inptr0++) * 3
    addu          t7, t0, t2         // t7 = thiscolsum
    addu          t6, t1, t3         // t6 = nextcolsum
    sll           t0, t7, 2          // t0 = thiscolsum * 4
    subu          t1, t0, t7         // t1 = thiscolsum * 3
    shra_r.w      t0, t0, 4
    addiu         t1, 7
    addu          t1, t1, t6
    srl           t1, t1, 4
    sb            t0, 0(s3)
    sb            t1, 1(s3)
    // Loop 2 below always runs at least once, so it must be skipped when
    // there is no column pair to process (downsampled_width of 2 or 3);
    // otherwise s0 steps past t8 and the loop never terminates.
    beq           t8, s0, 22f
     addiu        s3, 2
2:
    lh            t0, 0(s0)          // t0 = A3|A2
    lh            t2, 0(s1)          // t2 = B3|B2
    addiu         s0, 2
    addiu         s1, 2
    preceu.ph.qbr t0, t0             // t0 = 0|A3|0|A2
    preceu.ph.qbr t2, t2             // t2 = 0|B3|0|B2
    shll.ph       t1, t0, 1
    sll           t3, t6, 1
    addu.ph       t0, t1, t0         // t0 = A3*3|A2*3
    addu          t3, t3, t6         // t3 = this * 3
    addu.ph       t0, t0, t2         // t0 = next2|next1
    addu          t1, t3, t7
    andi          t7, t0, 0xFFFF     // t7 = next1
    sll           t2, t7, 1
    addu          t2, t7, t2         // t2 = next1*3
    addu          t4, t2, t6
    srl           t6, t0, 16         // t6 = next2
    shra_r.w      t1, t1, 4          // t1 = (this*3 + last + 8) >> 4
    addu          t0, t3, t7
    addiu         t0, 7
    srl           t0, t0, 4          // t0 = (this*3 + next1 + 7) >> 4
    shra_r.w      t4, t4, 4          // t4 = (next1*3 + this + 8) >> 4
    addu          t2, t2, t6
    addiu         t2, 7
    srl           t2, t2, 4          // t2 = (next1*3 + next2 + 7) >> 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    sb            t4, 2(s3)
    sb            t2, 3(s3)
    bne           t8, s0, 2b
     addiu        s3, 4
22:
    beqz          s5, 4f
     addu         t8, s0, s5
3:
    lbu           t0, 0(s0)
    lbu           t2, 0(s1)
    addiu         s0, 1
    addiu         s1, 1
    sll           t3, t6, 1
    sll           t1, t0, 1
    addu          t1, t0, t1         // t1 = inptr0 * 3
    addu          t3, t3, t6         // t3 = thiscolsum * 3
    addu          t5, t1, t2
    addu          t1, t3, t7
    shra_r.w      t1, t1, 4
    addu          t0, t3, t5
    addiu         t0, 7
    srl           t0, t0, 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         s3, 2
    move          t7, t6
    bne           t8, s0, 3b
     move         t6, t5
4:
    sll           t0, t6, 2          // t0 = thiscolsum * 4
    subu          t1, t0, t6         // t1 = thiscolsum * 3
    addu          t1, t1, t7
    addiu         s4, 4              // advance to the next output row slot
    shra_r.w      t1, t1, 4
    addiu         t0, 7
    srl           t0, t0, 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         t9, -1
    addiu         s3, 2
    bnez          t9, 1b
     lw           s1, 4(a2)          // second pass: inptr1 = next input row
    srl           t0, s4, 2          // t0 = output rows written so far
    subu          t0, a0, t0
    bgtz          t0, 0b
     addiu        a2, 4

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j             ra
     nop
END(jsimd_h2v2_fancy_upsample_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * Each input row produces one output row of twice the width.  Per row the
 * code emits the first column, then groups of four interior columns
 * (loop 1), then up to three residual interior columns (loop 2), then the
 * last column (label 22).
 *
 * Instructions indented by one extra space sit in branch delay slots.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz          a0, 3f             // nothing to do for zero rows
     sll          t0, a0, 2
    lw            s1, 0(a3)          // s1 = *output_data_ptr
    addu          s0, s1, t0         // s0 = end of the output row pointers
    li            s3, 0x10001        // +1 bias for both packed halfwords
0:
    addiu         t8, a1, -2         // t8 = downsampled_width - 2
    srl           t9, t8, 2          // t9 = number of 4-column groups
    lw            t7, 0(a2)          // t7 = inptr
    lw            s2, 0(s1)          // s2 = outptr
    lbu           t0, 0(t7)
    lbu           t1, 1(t7)          // t1 = inptr[1]
    sll           t2, t0, 1
    addu          t2, t2, t0         // t2 = invalue*3
    addu          t2, t2, t1
    shra_r.w      t2, t2, 2
    sb            t0, 0(s2)
    sb            t2, 1(s2)
    beqz          t9, 11f
     addiu        s2, 2
1:
    ulw           t0, 0(t7)          // t0 = |P3|P2|P1|P0|
    ulw           t1, 1(t7)
    ulh           t2, 4(t7)          // t2 = |0|0|P5|P4|
    preceu.ph.qbl t3, t0             // t3 = |0|P3|0|P2|
    preceu.ph.qbr t0, t0             // t0 = |0|P1|0|P0|
    preceu.ph.qbr t2, t2             // t2 = |0|P5|0|P4|
    preceu.ph.qbl t4, t1             // t4 = |0|P4|0|P3|
    preceu.ph.qbr t1, t1             // t1 = |0|P2|0|P1|
    shll.ph       t5, t4, 1
    shll.ph       t6, t1, 1
    addu.ph       t5, t5, t4         // t5 = |P4*3|P3*3|
    addu.ph       t6, t6, t1         // t6 = |P2*3|P1*3|
    addu.ph       t4, t3, s3
    addu.ph       t0, t0, s3
    addu.ph       t4, t4, t5
    addu.ph       t0, t0, t6
    shrl.ph       t4, t4, 2          // t4 = |0|P3|0|P2|
    shrl.ph       t0, t0, 2          // t0 = |0|P1|0|P0|
    addu.ph       t2, t2, t5
    addu.ph       t3, t3, t6
    shra_r.ph     t2, t2, 2          // t2 = |0|P5|0|P4|
    shra_r.ph     t3, t3, 2          // t3 = |0|P3|0|P2|
    shll.ph       t2, t2, 8
    shll.ph       t3, t3, 8
    or            t2, t4, t2
    or            t3, t3, t0
    addiu         t9, -1
    usw           t3, 0(s2)
    usw           t2, 4(s2)
    addiu         s2, 8
    bgtz          t9, 1b
     addiu        t7, 4
11:
    andi          t8, 3              // t8 = residual interior columns
    // With no residual columns the last column still has to be written and
    // the remaining rows processed, so branch to 22 rather than to the exit.
    beqz          t8, 22f
     addiu        t7, 1
2:
    lbu           t0, 0(t7)
    addiu         t7, 1
    sll           t1, t0, 1
    addu          t2, t0, t1         // t2 = invalue * 3
    lbu           t3, -2(t7)
    lbu           t4, 0(t7)
    addiu         t3, 1
    addiu         t4, 2
    addu          t3, t3, t2
    addu          t4, t4, t2
    srl           t3, 2
    srl           t4, 2
    sb            t3, 0(s2)
    sb            t4, 1(s2)
    addiu         t8, -1
    bgtz          t8, 2b
     addiu        s2, 2

22:
    lbu           t0, 0(t7)          // last column of the row
    lbu           t2, -1(t7)
    sll           t1, t0, 1
    addu          t1, t1, t0         // t1 = invalue * 3
    addu          t1, t1, t2
    addiu         t1, 1
    srl           t1, t1, 2
    sb            t1, 0(s2)
    sb            t0, 1(s2)
    addiu         s1, 4
    bne           s1, s0, 0b
     addiu        a2, 4
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j             ra
     nop
END(jsimd_h2v1_fancy_upsample_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
Reference in New Issue
Block a user