IJG R6b with x86SIMD V1.02
Independent JPEG Group's JPEG software release 6b with x86 SIMD extension for IJG JPEG library version 1.02
This commit is contained in:
288
jfdctflt.asm
Normal file
288
jfdctflt.asm
Normal file
@@ -0,0 +1,288 @@
|
||||
;
|
||||
; jfdctflt.asm - floating-point FDCT (non-SIMD)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
|
||||
;
|
||||
; Last Modified : October 17, 2004
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
%ifdef DCT_FLOAT_SUPPORTED
|
||||
|
||||
; This module is specialized to the case DCTSIZE = 8.
|
||||
;
|
||||
%if DCTSIZE != 8
|
||||
%error "Sorry, this code only copes with 8x8 DCTs."
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
%define ROTATOR_TYPE FP32 ; float
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_fdct_float)
|
||||
|
||||
EXTN(jconst_fdct_float):
|
||||
|
||||
F_0_382 dd 0.382683432365089771728460 ; cos(PI*3/8)
|
||||
F_0_707 dd 0.707106781186547524400844 ; cos(PI*1/4)
|
||||
F_0_541 dd 0.541196100146196984399723 ; cos(PI*1/8)-cos(PI*3/8)
|
||||
F_1_306 dd 1.306562964876376527856643 ; cos(PI*1/8)+cos(PI*3/8)
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jpeg_fdct_float (FAST_FLOAT * data)
|
||||
;
|
||||
|
||||
%define data(b) (b)+8 ; FAST_FLOAT * data
|
||||
|
||||
align 16
|
||||
global EXTN(jpeg_fdct_float)
|
||||
|
||||
EXTN(jpeg_fdct_float):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *)
|
||||
mov ecx, DCTSIZE
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
|
||||
fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
|
||||
fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
|
||||
fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; -- Even part
|
||||
|
||||
fld st2 ; st2 = st2 + st1, st1 = st2 - st1
|
||||
fsub st0,st2
|
||||
fxch st0,st2
|
||||
faddp st3,st0
|
||||
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
|
||||
fsub st0,st1
|
||||
fxch st0,st1
|
||||
faddp st4,st0
|
||||
|
||||
fadd st0,st1
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
|
||||
|
||||
fld st2 ; st3 = st2 + st3, st2 = st2 - st3
|
||||
fsub st0,st4
|
||||
fxch st0,st3
|
||||
faddp st4,st0
|
||||
fld st1 ; st0 = st1 + st0, st1 = st1 - st0
|
||||
fsub st0,st1
|
||||
fxch st0,st2
|
||||
faddp st1,st0
|
||||
|
||||
fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
|
||||
fstp FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; -- Odd part
|
||||
|
||||
fadd st2,st0
|
||||
fadd st0,st1
|
||||
fxch st0,st3
|
||||
fadd st1,st0
|
||||
fxch st0,st3
|
||||
|
||||
fld st2
|
||||
fxch st0,st1
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
|
||||
fxch st0,st1
|
||||
fsub st0,st2
|
||||
fxch st0,st3
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
|
||||
fxch st0,st3
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
|
||||
fxch st0,st2
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
|
||||
fxch st0,st2
|
||||
fadd st3,st0
|
||||
faddp st2,st0
|
||||
|
||||
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
|
||||
fsub st0,st1
|
||||
fxch st0,st1
|
||||
faddp st4,st0
|
||||
|
||||
fld st2 ; st0 = st0 + st2, st2 = st0 - st2
|
||||
fsubr st0,st1
|
||||
fxch st0,st3
|
||||
faddp st1,st0
|
||||
fld st1 ; st3 = st3 + st1, st1 = st3 - st1
|
||||
fsubr st0,st4
|
||||
fxch st0,st2
|
||||
faddp st4,st0
|
||||
|
||||
fstp FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
add edx, byte DCTSIZE*SIZEOF_FAST_FLOAT
|
||||
dec ecx ; advance pointer to next row
|
||||
jnz near .rowloop
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *)
|
||||
mov ecx, DCTSIZE
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
|
||||
fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
|
||||
fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
|
||||
fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
|
||||
fadd FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; -- Even part
|
||||
|
||||
fld st2 ; st2 = st2 + st1, st1 = st2 - st1
|
||||
fsub st0,st2
|
||||
fxch st0,st2
|
||||
faddp st3,st0
|
||||
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
|
||||
fsub st0,st1
|
||||
fxch st0,st1
|
||||
faddp st4,st0
|
||||
|
||||
fadd st0,st1
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
|
||||
|
||||
fld st2 ; st3 = st2 + st3, st2 = st2 - st3
|
||||
fsub st0,st4
|
||||
fxch st0,st3
|
||||
faddp st4,st0
|
||||
fld st1 ; st0 = st1 + st0, st1 = st1 - st0
|
||||
fsub st0,st1
|
||||
fxch st0,st2
|
||||
faddp st1,st0
|
||||
|
||||
fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
|
||||
fsub FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
|
||||
fxch st0,st4
|
||||
|
||||
fstp FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; -- Odd part
|
||||
|
||||
fadd st2,st0
|
||||
fadd st0,st1
|
||||
fxch st0,st3
|
||||
fadd st1,st0
|
||||
fxch st0,st3
|
||||
|
||||
fld st2
|
||||
fxch st0,st1
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
|
||||
fxch st0,st1
|
||||
fsub st0,st2
|
||||
fxch st0,st3
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
|
||||
fxch st0,st3
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
|
||||
fxch st0,st2
|
||||
fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
|
||||
fxch st0,st2
|
||||
fadd st3,st0
|
||||
faddp st2,st0
|
||||
|
||||
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
|
||||
fsub st0,st1
|
||||
fxch st0,st1
|
||||
faddp st4,st0
|
||||
|
||||
fld st2 ; st0 = st0 + st2, st2 = st0 - st2
|
||||
fsubr st0,st1
|
||||
fxch st0,st3
|
||||
faddp st1,st0
|
||||
fld st1 ; st3 = st3 + st1, st1 = st3 - st1
|
||||
fsubr st0,st4
|
||||
fxch st0,st2
|
||||
faddp st4,st0
|
||||
|
||||
fstp FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
|
||||
fstp FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
add edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
|
||||
dec ecx
|
||||
jnz near .columnloop
|
||||
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
%endif ; DCT_FLOAT_SUPPORTED
|
||||
Reference in New Issue
Block a user