/* * MIPS DSPr2 optimizations for libjpeg-turbo * * Copyright (C) 2013, MIPS Technologies, Inc., California. * All rights reserved. * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) * Darko Laus (darko.laus@imgtec.com) * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. 
*/

#include "jsimd_mips_dspr2_asm.h"

/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_mips_dspr2
 * jsimd_extbgr_ycc_convert_mips_dspr2
 * jsimd_extrgbx_ycc_convert_mips_dspr2
 * jsimd_extbgrx_ycc_convert_mips_dspr2
 * jsimd_extxbgr_ycc_convert_mips_dspr2
 * jsimd_extxrgb_ycc_convert_mips_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 *
 * One function is generated per pixel layout.  The macro arguments are:
 *   colorid     - text spliced into the function name
 *   pixel_size  - number of bytes the input pointer advances per pixel
 *   r_offs, g_offs, b_offs
 *               - byte offsets of the R, G and B samples inside one pixel
 */

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

/*
 * Load the three samples of the pixel at \inptr into \r, \g and \b
 * (zero-extended bytes), then step \inptr to the next pixel.
 */
.macro DO_RGB_TO_YCC r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
/*
 * Arguments:
 *   a0      - cinfo->image_width
 *   a1      - input_buf
 *   a2      - output_buf
 *   a3      - output_row
 *   16(sp)  - num_rows
 *
 * Register roles:
 *   s0-s7   - conversion coefficients (see the constants below)
 *   t8      - initial value of the Cb and Cr accumulators
 *   t7      - rows still to convert
 *   t6      - read pointer into the current input row
 *   t0-t2   - write pointers into output planes 0, 1 and 2
 *   t9      - value of t2 at which the current row is finished
 *   t3-t5   - R, G, B of the current pixel, then the three results
 *
 * Both loops test their condition at the bottom, so one pixel of one row
 * is always processed before either counter is checked.
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    /*
     * num_rows is documented at 16(sp) on entry but is read from 48(sp)
     * here; presumably SAVE_REGS_ON_STACK has lowered sp by 32 (the macro
     * is defined in the included header - confirm there).
     */
    lw          t7, 48(sp)          /* t7 = num_rows */
    li          s0, 0x4c8b          /* FIX(0.29900) */
    li          s1, 0x9646          /* FIX(0.58700) */
    li          s2, 0x1d2f          /* FIX(0.11400) */
    li          s3, 0xffffd4cd      /* -FIX(0.16874) */
    li          s4, 0xffffab33      /* -FIX(0.33126) */
    li          s5, 0x8000          /* FIX(0.50000) */
    li          s6, 0xffff94d1      /* -FIX(0.41869) */
    li          s7, 0xffffeb2f      /* -FIX(0.08131) */
    li          t8, 0x807fff        /* CBCR_OFFSET + ONE_HALF-1 */

0:  /* ---- per-row setup ---- */
    addiu       t7, -1              /* --num_rows */
    lw          t6, 0(a1)           /* t6 = input_buf[0] */
    lw          t0, 0(a2)
    lw          t1, 4(a2)
    lw          t2, 8(a2)
    sll         t3, a3, 2           /* byte offset of entry output_row */
    lwx         t0, t3(t0)          /* t0 = output_buf[0][output_row] */
    lwx         t1, t3(t1)          /* t1 = output_buf[1][output_row] */
    lwx         t2, t3(t2)          /* t2 = output_buf[2][output_row] */

    addu        t9, t2, a0          /* t9 = end address of the plane 2 row */
    addiu       a3, 1               /* next pass uses the following row */

1:  /* ---- per-pixel loop ---- */
    DO_RGB_TO_YCC t3, t4, t5, t6    /* t3 = R, t4 = G, t5 = B */

    /*
     * Three accumulators are built in an interleaved order:
     *   $ac0 = s5 + s0 * R + s1 * G + s2 * B    -> stored to plane 0
     *   $ac1 = t8 + s3 * R + s4 * G + s5 * B    -> stored to plane 1
     *   $ac2 = t8 + s5 * R + s6 * G + s7 * B    -> stored to plane 2
     * Only the low word of each accumulator is seeded (mtlo) and only the
     * low byte of each extracted value is stored, i.e. bits 16..23 of the
     * sum, which depend on the low words alone.
     */
    mtlo        s5, $ac0
    mtlo        t8, $ac1
    mtlo        t8, $ac2
    maddu       $ac0, s2, t5
    maddu       $ac1, s5, t5
    maddu       $ac2, s5, t3
    maddu       $ac0, s0, t3
    maddu       $ac1, s3, t3
    maddu       $ac2, s6, t4
    maddu       $ac0, s1, t4
    maddu       $ac1, s4, t4
    maddu       $ac2, s7, t5
    extr.w      t3, $ac0, 16        /* drop the 16 fraction bits */
    extr.w      t4, $ac1, 16
    extr.w      t5, $ac2, 16
    sb          t3, 0(t0)
    sb          t4, 0(t1)
    sb          t5, 0(t2)
    addiu       t0, 1
    addiu       t2, 1
    bne         t2, t9, 1b          /* more pixels in this row? */
     addiu      t1, 1               /* (delay slot) */
    bgtz        t7, 0b              /* more rows? */
     addiu      a1, 4               /* (delay slot) next input_buf entry */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_ycc_convert_mips_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*------------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3

/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_mips_dspr2
 * jsimd_ycc_extbgr_convert_mips_dspr2
 * jsimd_ycc_extrgbx_convert_mips_dspr2
 * jsimd_ycc_extbgrx_convert_mips_dspr2
 * jsimd_ycc_extxbgr_convert_mips_dspr2
 * jsimd_ycc_extxrgb_convert_mips_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 *
 * One function is generated per pixel layout.  The macro arguments are:
 *   colorid     - text spliced into the function name
 *   pixel_size  - number of bytes the output pointer advances per pixel
 *   r_offs, g_offs, b_offs
 *               - byte offsets of the R, G and B samples inside one pixel
 *   a_offs      - byte offset of the fourth sample; it is written (with
 *                 0xFF) only when pixel_size is 4
 */

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs

/*
 * Store one pixel at \outptr (\scratch0 = R, \scratch1 = G, \scratch2 = B)
 * and step \outptr to the next pixel.  For 4-byte pixels t0 is overwritten
 * with 0xFF after the three colour stores have been issued.
 */
.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r_offs(\outptr)
    sb          \scratch1, \g_offs(\outptr)
    sb          \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li          t0, 0xFF
    sb          t0, \a_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
/*
 * Arguments:
 *   a0      - cinfo->image_width
 *   a1      - input_buf
 *   a2      - input_row
 *   a3      - output_buf
 *   16(sp)  - num_rows
 *
 * Register roles:
 *   t3      - 0x8000, rounding term added before the green shift
 *   t4-t7   - conversion coefficients (see the constants below)
 *   t8      - 128 replicated into both halfwords
 *   s1      - rows still to convert
 *   s0      - write pointer into the current output row
 *   s2-s4   - read pointers into input planes 0 (y), 1 (cb) and 2 (cr)
 *   t9      - value of s2 at which the current row is finished
 *
 * Both loops test their condition at the bottom, so one pixel of one row
 * is always processed before either counter is checked.
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    /*
     * num_rows is documented at 16(sp) on entry but is read from 48(sp)
     * here; presumably SAVE_REGS_ON_STACK has lowered sp by 32 (the macro
     * is defined in the included header - confirm there).
     */
    lw          s1, 48(sp)          /* s1 = num_rows */
    li          t3, 0x8000
    li          t4, 0x166e9         /* FIX(1.40200) */
    li          t5, 0x1c5a2         /* FIX(1.77200) */
    li          t6, 0xffff492e      /* -FIX(0.71414) */
    li          t7, 0xffffa7e6      /* -FIX(0.34414) */
    repl.ph     t8, 128

0:  /* ---- per-row setup ---- */
    lw          s0, 0(a3)           /* s0 = output_buf[0] */
    lw          t0, 0(a1)
    lw          t1, 4(a1)
    lw          t2, 8(a1)
    sll         s5, a2, 2           /* byte offset of entry input_row */
    addiu       s1, -1              /* --num_rows */
    lwx         s2, s5(t0)          /* s2 = input_buf[0][input_row] */
    lwx         s3, s5(t1)          /* s3 = input_buf[1][input_row] */
    lwx         s4, s5(t2)          /* s4 = input_buf[2][input_row] */
    addu        t9, s2, a0          /* t9 = end address of the plane 0 row */
    addiu       a2, 1               /* next pass uses the following row */

1:  /* ---- per-pixel loop ---- */
    lbu         s7, 0(s4)           /* cr */
    lbu         s6, 0(s3)           /* cb */
    lbu         s5, 0(s2)           /* y */
    addiu       s2, 1
    addiu       s4, 1
    addiu       s7, -128            /* cr - 128 */
    addiu       s6, -128            /* cb - 128 */
    mul         t2, t7, s6          /* -FIX(0.34414) * (cb - 128) */
    mul         t0, t6, s7          /* Crgtab[cr] */
    sll         s7, 15              /* pre-scale for the Q31 multiply */
    mulq_rs.w   t1, t4, s7          /* Crrtab[cr] */
    sll         s6, 15              /* pre-scale for the Q31 multiply */
    addu        t2, t3              /* Cbgtab[cb] */
    addu        t2, t0              /* green term, still with 16 fraction bits */

    mulq_rs.w   t0, t5, s6          /* Cbbtab[cb] */
    sra         t2, 16              /* green term as an integer */
    addu        t1, s5              /* red   = y + Crrtab[cr] */
    addu        t2, s5              /* add y */
    ins         t2, t1, 16, 16      /* t2 = red (high half) | green (low half) */

    /*
     * Range limiting.  Each value has 128 subtracted, is shifted left with
     * saturation and arithmetically shifted back by the same amount, and
     * then has 128 added again.  Red and green go through this as a pair
     * of halfwords (shift 8); blue goes through it as a word (shift 24).
     */
    subu.ph     t2, t2, t8
    addu        t0, s5              /* blue  = y + Cbbtab[cb] */
    shll_s.ph   t2, t2, 8
    subu        t0, 128
    shra.ph     t2, t2, 8
    shll_s.w    t0, t0, 24
    addu.ph     t2, t2, t8          /* clip & store */
    sra         t0, t0, 24
    sra         t1, t2, 16          /* t1 = red; low byte of t2 = green */
    addiu       t0, 128

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne         s2, t9, 1b          /* more pixels in this row? */
     addiu      s3, 1               /* (delay slot) */
    bgtz        s1, 0b              /* more rows? */
     addiu      a3, 4               /* (delay slot) next output_buf entry */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_ycc_\colorid\()_convert_mips_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*------------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0

/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_mips_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
*/
LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*
 * Arguments:
 *   a0 - cinfo->max_v_samp_factor
 *   a1 - downsampled_width
 *   a2 - input_data
 *   a3 - output_data_ptr
 *
 * Register roles:
 *   s2 - *output_data_ptr (array of output row pointers)
 *   s4 - byte offset of the current entry in that array (4 per output row)
 *   t9 - output rows still to produce from the current input row (2, 1)
 *   s0 - inptr0: read pointer into the current input row
 *   s1 - inptr1: read pointer into the neighbouring input row (the row at
 *        -4(a2) for the first pass, the row at 4(a2) for the second)
 *   s3 - write pointer into the current output row
 *   t8 - value of s0 at which the running loop stops
 *   s5 - 1 if one column is left over for the single-column loop, else 0
 *   t7 - column sum of the previous column ("last")
 *   t6 - column sum of the current column ("this")
 * where a column sum is 3 * (sample from inptr0) + (sample from inptr1).
 *
 * The two-column loop at label 2 tests its condition at the bottom, so it
 * always runs once.  NOTE(review): callers therefore presumably guarantee
 * downsampled_width is large enough for that - confirm at the call site.
 */

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li          s4, 0
    lw          s2, 0(a3)           /* s2 = *output_data_ptr */
0:  /* ---- per input row ---- */
    li          t9, 2
    lw          s1, -4(a2)          /* s1 = inptr1 */

1:  /* ---- per output row ---- */
    lw          s0, 0(a2)           /* s0 = inptr0 */
    lwx         s3, s4(s2)          /* s3 = current output row */
    addiu       s5, a1, -2          /* s5 = downsampled_width - 2 */
    srl         t4, s5, 1
    sll         t4, t4, 1           /* t4 = s5 rounded down to even */
    lbu         t0, 0(s0)
    lbu         t1, 1(s0)
    lbu         t2, 0(s1)
    lbu         t3, 1(s1)
    addiu       s0, 2
    addiu       s1, 2
    addu        t8, s0, t4          /* t8 = end address */
    andi        s5, s5, 1           /* s5 = residual */
    sll         t4, t0, 1
    sll         t6, t1, 1
    addu        t0, t0, t4          /* t0 = (*inptr0++) * 3 */
    addu        t1, t1, t6          /* t1 = (*inptr0++) * 3 */
    addu        t7, t0, t2          /* t7 = thiscolsum */
    addu        t6, t1, t3          /* t6 = nextcolsum */

    /* First column: the left output has no left neighbour. */
    sll         t0, t7, 2           /* t0 = thiscolsum * 4 */
    subu        t1, t0, t7          /* t1 = thiscolsum * 3 */
    shra_r.w    t0, t0, 4           /* t0 = (thiscolsum*4 + 8) >> 4 */
    addiu       t1, 7
    addu        t1, t1, t6
    srl         t1, t1, 4           /* t1 = (thiscolsum*3 + nextcolsum + 7) >> 4 */
    sb          t0, 0(s3)
    sb          t1, 1(s3)
    addiu       s3, 2

2:  /* ---- two input columns -> four output bytes per iteration ---- */
    /*
     * NOTE(review): treating the low byte of each loaded halfword as the
     * earlier sample matches little-endian byte order - confirm the
     * intended target.
     */
    lh          t0, 0(s0)           /* t0 = A3|A2 */
    lh          t2, 0(s1)           /* t2 = B3|B2 */
    addiu       s0, 2
    addiu       s1, 2
    preceu.ph.qbr t0, t0            /* t0 = 0|A3|0|A2 */
    preceu.ph.qbr t2, t2            /* t2 = 0|B3|0|B2 */
    shll.ph     t1, t0, 1
    sll         t3, t6, 1
    addu.ph     t0, t1, t0          /* t0 = A3*3|A2*3 */
    addu        t3, t3, t6          /* t3 = this * 3 */
    addu.ph     t0, t0, t2          /* t0 = next2|next1 */
    addu        t1, t3, t7          /* t1 = this*3 + last */
    andi        t7, t0, 0xFFFF      /* t7 = next1 */
    sll         t2, t7, 1
    addu        t2, t7, t2          /* t2 = next1*3 */
    addu        t4, t2, t6          /* t4 = next1*3 + this */
    srl         t6, t0, 16          /* t6 = next2 */
    shra_r.w    t1, t1, 4           /* t1 = (this*3 + last + 8) >> 4 */
    addu        t0, t3, t7
    addiu       t0, 7
    srl         t0, t0, 4           /* t0 = (this*3 + next1 + 7) >> 4 */
    shra_r.w    t4, t4, 4           /* t4 = (next1*3 + this + 8) >> 4 */
    addu        t2, t2, t6
    addiu       t2, 7
    srl         t2, t2, 4           /* t2 = (next1*3 + next2 + 7) >> 4 */
    sb          t1, 0(s3)
    sb          t0, 1(s3)
    sb          t4, 2(s3)
    sb          t2, 3(s3)
    bne         t8, s0, 2b
     addiu      s3, 4               /* (delay slot) */

    beqz        s5, 4f              /* no left-over column */
     addu       t8, s0, s5          /* (delay slot) new end address */

3:  /* ---- one input column -> two output bytes per iteration ---- */
    lbu         t0, 0(s0)
    lbu         t2, 0(s1)
    addiu       s0, 1
    addiu       s1, 1
    sll         t3, t6, 1
    sll         t1, t0, 1
    addu        t1, t0, t1          /* t1 = inptr0 * 3 */
    addu        t3, t3, t6          /* t3 = thiscolsum * 3 */
    addu        t5, t1, t2          /* t5 = nextcolsum */
    addu        t1, t3, t7
    shra_r.w    t1, t1, 4           /* t1 = (this*3 + last + 8) >> 4 */
    addu        t0, t3, t5
    addiu       t0, 7
    srl         t0, t0, 4           /* t0 = (this*3 + next + 7) >> 4 */
    sb          t1, 0(s3)
    sb          t0, 1(s3)
    addiu       s3, 2
    move        t7, t6              /* last = this */
    bne         t8, s0, 3b
     move       t6, t5              /* (delay slot) this = next */

4:  /* ---- last column: the right output has no right neighbour ---- */
    sll         t0, t6, 2           /* t0 = thiscolsum * 4 */
    subu        t1, t0, t6          /* t1 = thiscolsum * 3 */
    addu        t1, t1, t7
    addiu       s4, 4               /* advance to the next output row entry */
    shra_r.w    t1, t1, 4           /* t1 = (this*3 + last + 8) >> 4 */
    addiu       t0, 7
    srl         t0, t0, 4           /* t0 = (this*4 + 7) >> 4 */
    sb          t1, 0(s3)
    sb          t0, 1(s3)
    addiu       t9, -1
    addiu       s3, 2
    bnez        t9, 1b              /* second output row for this input row */
     lw         s1, 4(a2)           /* (delay slot) inptr1 for that pass */
    srl         t0, s4, 2           /* t0 = output rows written so far */
    subu        t0, a0, t0
    bgtz        t0, 0b              /* fewer than max_v_samp_factor rows done? */
     addiu      a2, 4               /* (delay slot) next input row */

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j           ra
     nop
END(jsimd_h2v2_fancy_upsample_mips_dspr2)

/*****************************************************************************/
/*
 * jsimd_h2v1_fancy_upsample_mips_dspr2
 *
 * Fancy processing for 2:1 horizontal upsampling: every input sample
 * produces two output samples, each computed as
 * (3 * nearer sample + further sample + rounding) >> 2.  The first and the
 * last column are special-cased because they have only one neighbour.
 */
LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*
 * Arguments:
 *   a0 - cinfo->max_v_samp_factor
 *   a1 - downsampled_width
 *   a2 - input_data
 *   a3 - output_data_ptr
 *
 * Register roles:
 *   s1 - pointer to the current entry of *output_data_ptr
 *   s0 - value of s1 at which all rows are done (s1 + 4 * a0)
 *   s2 - write pointer into the current output row
 *   s3 - 0x10001: the value 1 in both halfwords (rounding term)
 *   t7 - read pointer into the current input row
 *   t8 - downsampled_width - 2, later reduced to its low two bits
 *   t9 - number of 4-column groups handled by the packed loop
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz        a0, 3f              /* no rows: nothing to do */
     sll        t0, a0, 2           /* (delay slot) */
    lw          s1, 0(a3)
    addu        s0, s1, t0
    li          s3, 0x10001
0:  /* ---- per row ---- */
    addiu       t8, a1, -2
    srl         t9, t8, 2
    lw          t7, 0(a2)           /* t7 = input row */
    lw          s2, 0(s1)           /* s2 = output row */

    /* First column: out[0] = in[0], out[1] = (in[0]*3 + in[1] + 2) >> 2 */
    lbu         t0, 0(t7)
    lbu         t1, 1(t7)           /* t1 = inptr[1] */
    sll         t2, t0, 1
    addu        t2, t2, t0          /* t2 = invalue*3 */
    addu        t2, t2, t1
    shra_r.w    t2, t2, 2
    sb          t0, 0(s2)
    sb          t2, 1(s2)
    beqz        t9, 11f
     addiu      s2, 2               /* (delay slot) */

1:  /* ---- columns P1..P4 -> eight output bytes per iteration ---- */
    /*
     * NOTE(review): the byte labels below and the packing of the results
     * match little-endian byte order - confirm the intended target.
     */
    ulw         t0, 0(t7)           /* t0 = |P3|P2|P1|P0| */
    ulw         t1, 1(t7)           /* t1 = |P4|P3|P2|P1| */
    ulh         t2, 4(t7)           /* t2 = |0|0|P5|P4| */
    preceu.ph.qbl t3, t0            /* t3 = |0|P3|0|P2| */
    preceu.ph.qbr t0, t0            /* t0 = |0|P1|0|P0| */
    preceu.ph.qbr t2, t2            /* t2 = |0|P5|0|P4| */
    preceu.ph.qbl t4, t1            /* t4 = |0|P4|0|P3| */
    preceu.ph.qbr t1, t1            /* t1 = |0|P2|0|P1| */
    shll.ph     t5, t4, 1
    shll.ph     t6, t1, 1
    addu.ph     t5, t5, t4          /* t5 = |P4*3|P3*3| */
    addu.ph     t6, t6, t1          /* t6 = |P2*3|P1*3| */
    addu.ph     t4, t3, s3          /* t4 = |P3+1|P2+1| */
    addu.ph     t0, t0, s3          /* t0 = |P1+1|P0+1| */
    addu.ph     t4, t4, t5
    addu.ph     t0, t0, t6
    shrl.ph     t4, t4, 2           /* t4 = left outputs of P4|P3 */
    shrl.ph     t0, t0, 2           /* t0 = left outputs of P2|P1 */
    addu.ph     t2, t2, t5          /* t2 = |P4*3+P5|P3*3+P4| */
    addu.ph     t3, t3, t6          /* t3 = |P2*3+P3|P1*3+P2| */
    shra_r.ph   t2, t2, 2           /* t2 = right outputs of P4|P3 */
    shra_r.ph   t3, t3, 2           /* t3 = right outputs of P2|P1 */
    shll.ph     t2, t2, 8
    shll.ph     t3, t3, 8
    or          t2, t4, t2          /* interleave left/right for P3, P4 */
    or          t3, t3, t0          /* interleave left/right for P1, P2 */
    addiu       t9, -1
    usw         t3, 0(s2)
    usw         t2, 4(s2)
    addiu       s2, 8
    bgtz        t9, 1b
     addiu      t7, 4               /* (delay slot) */

11:
    andi        t8, 3               /* columns left for the scalar loop */
    /*
     * With no columns left over, only the scalar loop may be skipped: the
     * last column of this row and all remaining rows still have to be
     * produced, so the target is the last-column code and not the exit.
     */
    beqz        t8, 22f
     addiu      t7, 1               /* (delay slot) step past the column done */

2:  /* ---- one column -> two output bytes per iteration ---- */
    lbu         t0, 0(t7)
    addiu       t7, 1
    sll         t1, t0, 1
    addu        t2, t0, t1          /* t2 = invalue * 3 */
    lbu         t3, -2(t7)          /* t3 = previous column */
    lbu         t4, 0(t7)           /* t4 = next column */
    addiu       t3, 1
    addiu       t4, 2
    addu        t3, t3, t2
    addu        t4, t4, t2
    srl         t3, 2               /* t3 = (invalue*3 + previous + 1) >> 2 */
    srl         t4, 2               /* t4 = (invalue*3 + next + 2) >> 2 */
    sb          t3, 0(s2)
    sb          t4, 1(s2)
    addiu       t8, -1
    bgtz        t8, 2b
     addiu      s2, 2               /* (delay slot) */

22: /* ---- last column: out = (in*3 + previous + 1) >> 2, then in ---- */
    lbu         t0, 0(t7)
    lbu         t2, -1(t7)
    sll         t1, t0, 1
    addu        t1, t1, t0          /* t1 = invalue * 3 */
    addu        t1, t1, t2
    addiu       t1, 1
    srl         t1, t1, 2
    sb          t1, 0(s2)
    sb          t0, 1(s2)
    addiu       s1, 4               /* next output row entry */
    bne         s1, s0, 0b
     addiu      a2, 4               /* (delay slot) next input row entry */

3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_h2v1_fancy_upsample_mips_dspr2)

/*****************************************************************************/