Independent JPEG Group's JPEG software release 6b with x86 SIMD extension for IJG JPEG library version 1.02
289 lines
6.9 KiB
NASM
289 lines
6.9 KiB
NASM
;
|
|
; jfdctflt.asm - floating-point FDCT (non-SIMD)
|
|
;
|
|
; x86 SIMD extension for IJG JPEG library
|
|
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
|
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
|
;
|
|
; This file should be assembled with NASM (Netwide Assembler),
|
|
; can *not* be assembled with Microsoft's MASM or any compatible
|
|
; assembler (including Borland's Turbo Assembler).
|
|
; NASM is available from http://nasm.sourceforge.net/ or
|
|
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
|
;
|
|
; This file contains a floating-point implementation of the forward DCT
|
|
; (Discrete Cosine Transform). The following code is based directly on
|
|
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
|
|
;
|
|
; Last Modified : October 17, 2004
|
|
;
|
|
; [TAB8]
|
|
|
|
%include "jsimdext.inc"
|
|
%include "jdct.inc"
|
|
|
|
%ifdef DCT_FLOAT_SUPPORTED
|
|
|
|
; This module is specialized to the case DCTSIZE = 8.
|
|
;
|
|
%if DCTSIZE != 8
|
|
%error "Sorry, this code only copes with 8x8 DCTs."
|
|
%endif
|
|
|
|
; --------------------------------------------------------------------------
|
|
SECTION SEG_CONST
;
; Multiplier table for the floating-point forward DCT in this file.
; The table is exported under the label EXTN(jconst_fdct_float); the code
; reads individual entries as  fmul ROTATOR_TYPE [GOTOFF(ebx,F_x_xxx)].
; Every entry is emitted with "dd", i.e. as one 4-byte floating-point value,
; so the literals below are rounded to that precision by the assembler.
;
|
|
|
|
%define ROTATOR_TYPE FP32 ; float -- operand-size keyword used when an entry is read (FP32 is defined outside this file)
|
|
|
|
alignz 16 ; NOTE(review): alignz is a macro from an include; presumably pads to a 16-byte boundary -- confirm
|
|
global EXTN(jconst_fdct_float)
|
|
|
|
EXTN(jconst_fdct_float):
|
|
|
|
F_0_382 dd 0.382683432365089771728460 ; cos(PI*3/8); used in the odd part of each pass
|
|
F_0_707 dd 0.707106781186547524400844 ; cos(PI*1/4); used once in the even part and once in the odd part of each pass
|
|
F_0_541 dd 0.541196100146196984399723 ; cos(PI*1/8)-cos(PI*3/8); used in the odd part of each pass
|
|
F_1_306 dd 1.306562964876376527856643 ; cos(PI*1/8)+cos(PI*3/8); used in the odd part of each pass
|
|
|
|
alignz 16 ; the four dd entries above occupy exactly 16 bytes
|
|
|
|
; --------------------------------------------------------------------------
|
|
SECTION SEG_TEXT
|
|
BITS 32
|
|
;
|
|
; Perform the forward DCT on one block of samples.
|
|
;
|
|
; GLOBAL(void)
|
|
; jpeg_fdct_float (FAST_FLOAT * data)
|
|
;
; In:    data  - the only argument; read from [ebp+8] after the
;                "push ebp / mov ebp,esp" frame setup below.
; Out:   nothing is returned in a register.  The block at data is
;        transformed in place: pass 1 rewrites each of the DCTSIZE rows,
;        pass 2 then rewrites each of the DCTSIZE columns.
; Regs:  ebx is saved with pushpic and restored with poppic (macros defined
;        outside this file) and is the base register given to GOTOFF().
;        ecx (loop counter) and edx (element pointer) are overwritten.
;        The routine ends with a plain "ret", so the argument stays on the
;        stack for the caller to remove.
; x87:   one loop iteration has eight values on the x87 register stack at
;        its peak and pops everything it pushed, so the x87 stack has to be
;        empty on entry and is back at its entry depth on return.
;
; Notation used in the comments below:
;   dN        element N of the current row/column as loaded from memory
;   tmp0..3   d0+d7, d1+d6, d2+d5, d3+d4
;   tmp4..7   d3-d4, d2-d5, d1-d6, d0-d7
;   outN      value stored back to element N
;   "a | b | c ..."  x87 stack contents, st0 first
;
; Per row/column the code computes:
;   even part:  tmp10 = tmp0+tmp3   tmp13 = tmp0-tmp3
;               tmp11 = tmp1+tmp2   tmp12 = tmp1-tmp2
;               z1    = (tmp12+tmp13) * F_0_707
;               out0 = tmp10+tmp11  out4 = tmp10-tmp11
;               out2 = tmp13+z1     out6 = tmp13-z1
;   odd part:   s45 = tmp4+tmp5   s56 = tmp5+tmp6   s67 = tmp6+tmp7
;               z5  = (s45-s67) * F_0_382
;               z2  = s45 * F_0_541 + z5
;               z4  = s67 * F_1_306 + z5
;               z3  = s56 * F_0_707
;               z11 = tmp7+z3       z13 = tmp7-z3
;               out5 = z13+z2  out3 = z13-z2  out1 = z11+z4  out7 = z11-z4
;
; NOTE(review): ROW()/COL() are addressing macros defined outside this file;
; presumably ROW(n,..) selects element n of the row at edx and COL(n,..)
; element n of the column at edx -- confirm against the include file.
;
|
|
|
|
%define data(b) (b)+8 ; FAST_FLOAT * data
|
|
|
|
align 16
|
|
global EXTN(jpeg_fdct_float)
|
|
|
|
EXTN(jpeg_fdct_float):
|
|
push ebp
|
|
mov ebp,esp ; frame pointer: the argument is now at [ebp+8]
|
|
pushpic ebx ; save ebx before get_GOT overwrites it
|
|
; push ecx ; need not be preserved
|
|
; push edx ; need not be preserved
|
|
; push esi ; unused
|
|
; push edi ; unused
|
|
|
|
get_GOT ebx ; get GOT address (base register for every GOTOFF() below)
|
|
|
|
; ---- Pass 1: process rows.
|
|
|
|
mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *) edx = start of the block
|
|
mov ecx, DCTSIZE ; ecx = rows still to process
|
|
alignx 16,7 ; NOTE(review): macro from an include; presumably aligns the loop head -- confirm
|
|
.rowloop:
|
|
fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp1 = d1+d6
|
|
fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp0 = d0+d7
|
|
fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp3 = d3+d4
|
|
fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp2 = d2+d5  => tmp2 | tmp3 | tmp0 | tmp1
|
|
|
|
; -- Even part
|
|
|
|
fld st2 ; st2 = st2 + st1, st1 = st2 - st1 (register numbers as they were before this fld)
|
|
fsub st0,st2 ; st0 = tmp0 - tmp3 = tmp13
|
|
fxch st0,st2 ; bring tmp3 to the top so faddp can consume it
|
|
faddp st3,st0 ; tmp10 = tmp0 + tmp3  => tmp2 | tmp13 | tmp10 | tmp1
|
|
fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (numbering before this fld)
|
|
fsub st0,st1 ; st0 = tmp1 - tmp2 = tmp12
|
|
fxch st0,st1 ; bring tmp2 to the top so faddp can consume it
|
|
faddp st4,st0 ; tmp11 = tmp1 + tmp2  => tmp12 | tmp13 | tmp10 | tmp11
|
|
|
|
fadd st0,st1 ; st0 = tmp12 + tmp13
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; z1  => z1 | tmp13 | tmp10 | tmp11
|
|
|
|
fld st2 ; st3 = st2 + st3, st2 = st2 - st3 (numbering before this fld)
|
|
fsub st0,st4 ; st0 = tmp10 - tmp11 = out4
|
|
fxch st0,st3 ; bring tmp10 to the top so faddp can consume it
|
|
faddp st4,st0 ; out0 = tmp10 + tmp11  => z1 | tmp13 | out4 | out0
|
|
fld st1 ; st0 = st1 + st0, st1 = st1 - st0 (numbering before this fld)
|
|
fsub st0,st1 ; st0 = tmp13 - z1 = out6
|
|
fxch st0,st2 ; bring tmp13 to the top so faddp can consume it
|
|
faddp st1,st0 ; out2 = tmp13 + z1  => out2 | out6 | out4 | out0
|
|
|
|
fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp7 = d0-d7
|
|
fxch st0,st4 ; park the new difference in st4, below the four pending results
|
|
fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp4 = d3-d4
|
|
fxch st0,st4 ; park tmp4 in st4
|
|
fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp6 = d1-d6
|
|
fxch st0,st4 ; park tmp6 in st4
|
|
fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp5 = d2-d5 (eight values on the x87 stack here)
|
|
fxch st0,st4 ; => out2 | out6 | out4 | out0 | tmp5 | tmp6 | tmp4 | tmp7
|
|
|
|
; every input element has been read by now, so the stores may overwrite the row
fstp FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)] ; store out2
|
|
fstp FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] ; store out6
|
|
fstp FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] ; store out4
|
|
fstp FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)] ; store out0  => tmp5 | tmp6 | tmp4 | tmp7
|
|
|
|
; -- Odd part
|
|
|
|
fadd st2,st0 ; st2 = tmp4 + tmp5 = s45
|
|
fadd st0,st1 ; st0 = tmp5 + tmp6 = s56
|
|
fxch st0,st3 ; tmp7 to the top
|
|
fadd st1,st0 ; st1 = tmp6 + tmp7 = s67
|
|
fxch st0,st3 ; => s56 | s67 | s45 | tmp7
|
|
|
|
fld st2 ; second copy of s45  => s45 | s56 | s67 | s45 | tmp7
|
|
fxch st0,st1
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; z3 = s56 * F_0_707
|
|
fxch st0,st1 ; => s45 | z3 | s67 | s45 | tmp7
|
|
fsub st0,st2 ; st0 = s45 - s67
|
|
fxch st0,st3
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)] ; st0 = s45 * F_0_541
|
|
fxch st0,st3 ; => (s45-s67) | z3 | s67 | s45*F_0_541 | tmp7
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)] ; z5 = (s45 - s67) * F_0_382
|
|
fxch st0,st2
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)] ; st0 = s67 * F_1_306
|
|
fxch st0,st2 ; => z5 | z3 | s67*F_1_306 | s45*F_0_541 | tmp7
|
|
fadd st3,st0 ; z2 = s45*F_0_541 + z5
|
|
faddp st2,st0 ; z4 = s67*F_1_306 + z5  => z3 | z4 | z2 | tmp7
|
|
|
|
fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (numbering before this fld)
|
|
fsub st0,st1 ; st0 = tmp7 - z3 = z13
|
|
fxch st0,st1 ; bring z3 to the top so faddp can consume it
|
|
faddp st4,st0 ; z11 = tmp7 + z3  => z13 | z4 | z2 | z11
|
|
|
|
fld st2 ; st0 = st0 + st2, st2 = st0 - st2 (numbering before this fld)
|
|
fsubr st0,st1 ; reversed subtract: st0 = st1 - st0 = z13 - z2 = out3
|
|
fxch st0,st3 ; bring z2 to the top so faddp can consume it
|
|
faddp st1,st0 ; out5 = z13 + z2  => out5 | z4 | out3 | z11
|
|
fld st1 ; st3 = st3 + st1, st1 = st3 - st1 (numbering before this fld)
|
|
fsubr st0,st4 ; reversed subtract: st0 = st4 - st0 = z11 - z4 = out7
|
|
fxch st0,st2 ; bring z4 to the top so faddp can consume it
|
|
faddp st4,st0 ; out1 = z11 + z4  => out5 | out7 | out3 | out1
|
|
|
|
fstp FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] ; store out5
|
|
fstp FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] ; store out7
|
|
fstp FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)] ; store out3
|
|
fstp FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)] ; store out1; the x87 stack is back at its entry depth
|
|
|
|
add edx, byte DCTSIZE*SIZEOF_FAST_FLOAT ; advance pointer to next row
|
|
dec ecx ; one row fewer to go; sets ZF after the last row
|
|
jnz near .rowloop ; "near": the loop body is longer than a short jump can span
|
|
|
|
; ---- Pass 2: process columns.
;
; The instruction sequence is the same as in pass 1; only the addressing
; macro (COL instead of ROW) and the pointer step at the end of the loop
; differ.  Stack-state comments are therefore abbreviated here.
|
|
|
|
mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *) edx = start of the block again
|
|
mov ecx, DCTSIZE ; ecx = columns still to process
|
|
alignx 16,7 ; NOTE(review): macro from an include; presumably aligns the loop head -- confirm
|
|
.columnloop:
|
|
fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp1 = d1+d6
|
|
fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp0 = d0+d7
|
|
fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp3 = d3+d4
|
|
fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
|
|
fadd FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp2 = d2+d5  => tmp2 | tmp3 | tmp0 | tmp1
|
|
|
|
; -- Even part
|
|
|
|
fld st2 ; st2 = st2 + st1, st1 = st2 - st1 (numbering before this fld)
|
|
fsub st0,st2
|
|
fxch st0,st2
|
|
faddp st3,st0 ; => tmp2 | tmp13 | tmp10 | tmp1
|
|
fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (numbering before this fld)
|
|
fsub st0,st1
|
|
fxch st0,st1
|
|
faddp st4,st0 ; => tmp12 | tmp13 | tmp10 | tmp11
|
|
|
|
fadd st0,st1
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; z1  => z1 | tmp13 | tmp10 | tmp11
|
|
|
|
fld st2 ; st3 = st2 + st3, st2 = st2 - st3 (numbering before this fld)
|
|
fsub st0,st4
|
|
fxch st0,st3
|
|
faddp st4,st0 ; => z1 | tmp13 | out4 | out0
|
|
fld st1 ; st0 = st1 + st0, st1 = st1 - st0 (numbering before this fld)
|
|
fsub st0,st1
|
|
fxch st0,st2
|
|
faddp st1,st0 ; => out2 | out6 | out4 | out0
|
|
|
|
fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp7 = d0-d7
|
|
fxch st0,st4
|
|
fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp4 = d3-d4
|
|
fxch st0,st4
|
|
fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp6 = d1-d6
|
|
fxch st0,st4
|
|
fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
|
|
fsub FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp5 = d2-d5 (eight values on the x87 stack here)
|
|
fxch st0,st4 ; => out2 | out6 | out4 | out0 | tmp5 | tmp6 | tmp4 | tmp7
|
|
|
|
fstp FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)] ; store out2
|
|
fstp FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] ; store out6
|
|
fstp FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] ; store out4
|
|
fstp FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)] ; store out0  => tmp5 | tmp6 | tmp4 | tmp7
|
|
|
|
; -- Odd part
|
|
|
|
fadd st2,st0
|
|
fadd st0,st1
|
|
fxch st0,st3
|
|
fadd st1,st0
|
|
fxch st0,st3 ; => s56 | s67 | s45 | tmp7
|
|
|
|
fld st2
|
|
fxch st0,st1
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; z3 = s56 * F_0_707
|
|
fxch st0,st1
|
|
fsub st0,st2
|
|
fxch st0,st3
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)] ; st0 = s45 * F_0_541
|
|
fxch st0,st3
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)] ; z5 = (s45 - s67) * F_0_382
|
|
fxch st0,st2
|
|
fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)] ; st0 = s67 * F_1_306
|
|
fxch st0,st2
|
|
fadd st3,st0 ; z2 = s45*F_0_541 + z5
|
|
faddp st2,st0 ; z4 = s67*F_1_306 + z5  => z3 | z4 | z2 | tmp7
|
|
|
|
fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (numbering before this fld)
|
|
fsub st0,st1
|
|
fxch st0,st1
|
|
faddp st4,st0 ; => z13 | z4 | z2 | z11
|
|
|
|
fld st2 ; st0 = st0 + st2, st2 = st0 - st2 (numbering before this fld)
|
|
fsubr st0,st1 ; reversed subtract: st0 = st1 - st0
|
|
fxch st0,st3
|
|
faddp st1,st0 ; => out5 | z4 | out3 | z11
|
|
fld st1 ; st3 = st3 + st1, st1 = st3 - st1 (numbering before this fld)
|
|
fsubr st0,st4 ; reversed subtract: st0 = st4 - st0
|
|
fxch st0,st2
|
|
faddp st4,st0 ; => out5 | out7 | out3 | out1
|
|
|
|
fstp FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] ; store out5
|
|
fstp FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] ; store out7
|
|
fstp FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)] ; store out3
|
|
fstp FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)] ; store out1; the x87 stack is back at its entry depth
|
|
|
|
add edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
|
|
dec ecx ; one column fewer to go; sets ZF after the last column
|
|
jnz near .columnloop ; "near": the loop body is longer than a short jump can span
|
|
|
|
; pop edi ; unused
|
|
; pop esi ; unused
|
|
; pop edx ; need not be preserved
|
|
; pop ecx ; need not be preserved
|
|
poppic ebx ; undo the pushpic from the prologue
|
|
pop ebp
|
|
ret ; no operand: the argument is left for the caller to remove
|
|
|
|
%endif ; DCT_FLOAT_SUPPORTED
|