;
; jfdctflt.asm - floating-point FDCT (non-SIMD)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
;
; Last Modified : October 17, 2004
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

%ifdef DCT_FLOAT_SUPPORTED

; This module is specialized to the case DCTSIZE = 8.
; The transform code below addresses elements 0..7 of every row and column
; explicitly, so any other DCTSIZE is rejected at assembly time.
%if DCTSIZE != 8
%error "Sorry, this code only copes with 8x8 DCTs."
%endif

; --------------------------------------------------------------------------
	SECTION	SEG_CONST

; Operand-size keyword used by every fmul that reads one of the constants
; below.  The table entries are stored with dd (32 bits each), so this must
; name a 32-bit float operand.  FP32 itself is defined outside this file.
%define ROTATOR_TYPE	FP32	; float

	alignz	16
	global	EXTN(jconst_fdct_float)

; Read-only multiplier table for jpeg_fdct_float.  The code addresses the
; individual entries as GOTOFF(ebx,F_x_xxx); the label names encode the
; (truncated) decimal value of each constant.
EXTN(jconst_fdct_float):

F_0_382	dd	0.382683432365089771728460	; cos(PI*3/8)
F_0_707	dd	0.707106781186547524400844	; cos(PI*1/4)
F_0_541	dd	0.541196100146196984399723	; cos(PI*1/8)-cos(PI*3/8)
F_1_306	dd	1.306562964876376527856643	; cos(PI*1/8)+cos(PI*3/8)

	alignz	16

; --------------------------------------------------------------------------
	SECTION	SEG_TEXT
	BITS	32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jpeg_fdct_float (FAST_FLOAT * data)
;
; In:	data is the only argument; it is read from the stack at [ebp+8]
;	once the ebp frame has been set up (32-bit code).
; Out:	no return value.  The block is transformed in place: both passes
;	load from and store back through the same pointer.
; Regs:	ebx = base register for the GOTOFF() constant references
;	      (handled by pushpic/get_GOT/poppic),
;	edx = pointer to the current row (pass 1) or column (pass 2),
;	ecx = loop counter, DCTSIZE down to 0.
;	ecx and edx are modified and not restored.
; x87:	every value pushed inside a loop iteration is popped again before
;	the iteration ends, so the FPU stack depth on return equals the depth
;	on entry.  An iteration peaks at eight live values, i.e. the whole
;	x87 register stack, so the stack has to be empty on entry.
;
; Notation used in the comments below (d0..d7 = the eight input values of
; the current row or column; "stack:" lines list st0 first):
;	tmp0 = d0+d7	tmp1 = d1+d6	tmp2 = d2+d5	tmp3 = d3+d4
;	tmp7 = d0-d7	tmp6 = d1-d6	tmp5 = d2-d5	tmp4 = d3-d4
;	tmp10 = tmp0+tmp3		tmp13 = tmp0-tmp3
;	tmp11 = tmp1+tmp2		tmp12 = tmp1-tmp2
;	o10 = tmp4+tmp5   o11 = tmp5+tmp6   o12 = tmp6+tmp7
;
; ROW(), COL(), FAST_FLOAT, POINTER, pushpic/poppic, get_GOT, GOTOFF and
; alignx are defined in the included files, not here.
; NOTE(review): ROW(n,..)/COL(n,..) are assumed to address element n of the
; current row/column -- confirm against the include files.
;

%define data(b)	(b)+8		; FAST_FLOAT * data

	align	16
	global	EXTN(jpeg_fdct_float)

EXTN(jpeg_fdct_float):
	push	ebp
	mov	ebp,esp
	pushpic	ebx
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
;	push	esi		; unused
;	push	edi		; unused

	get_GOT	ebx		; get GOT address

	; ---- Pass 1: process rows.
	; One iteration transforms one row of eight values; edx then moves on
	; by DCTSIZE elements.

	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
	mov	ecx, DCTSIZE
	alignx	16,7
.rowloop:
	; Each fld/fadd pair pushes one of the sums tmp1, tmp0, tmp3, tmp2.
	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
	; stack: tmp2 tmp3 tmp0 tmp1

	; -- Even part
	; The recurring fld/fsub/fxch/faddp group is an in-place butterfly: it
	; replaces two stack entries by their sum and difference.  The comment
	; on each fld names the registers as they are numbered before that fld.

	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
	fsub	st0,st2
	fxch	st0,st2
	faddp	st3,st0
	; stack: tmp2 tmp13 tmp10 tmp1
	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
	fsub	st0,st1
	fxch	st0,st1
	faddp	st4,st0
	; stack: tmp12 tmp13 tmp10 tmp11

	fadd	st0,st1
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
	; stack: z1 tmp13 tmp10 tmp11	(z1 = (tmp12+tmp13) * F_0_707)

	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
	fsub	st0,st4
	fxch	st0,st3
	faddp	st4,st0
	; stack: z1 tmp13 (tmp10-tmp11) (tmp10+tmp11)
	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
	fsub	st0,st1
	fxch	st0,st2
	faddp	st1,st0
	; stack: (tmp13+z1) (tmp13-z1) (tmp10-tmp11) (tmp10+tmp11)
	;	 = results for elements 2, 6, 4, 0

	; Compute the four differences for the odd part now, while the inputs
	; are still unmodified in memory.  Each fxch st0,st4 moves the value
	; just pushed below the four pending even results; after four rounds
	; the even results are back in their original order on top.
	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	; stack: out2 out6 out4 out0 tmp5 tmp6 tmp4 tmp7  (all 8 registers)

	; Store the even results; this overwrites inputs 2, 6, 4 and 0.
	fstp	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
	; stack: tmp5 tmp6 tmp4 tmp7

	; -- Odd part

	fadd	st2,st0		; st2 = o10 = tmp4 + tmp5
	fadd	st0,st1		; st0 = o11 = tmp5 + tmp6
	fxch	st0,st3
	fadd	st1,st0		; st1 = o12 = tmp6 + tmp7
	fxch	st0,st3
	; stack: o11 o12 o10 tmp7

	fld	st2		; extra copy of o10
	fxch	st0,st1
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]	; z3 = o11 * F_0_707
	fxch	st0,st1
	fsub	st0,st2					; o10 - o12
	fxch	st0,st3
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]	; o10 * F_0_541
	fxch	st0,st3
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]	; z5 = (o10-o12) * F_0_382
	fxch	st0,st2
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]	; o12 * F_1_306
	fxch	st0,st2
	fadd	st3,st0		; z2 = o10*F_0_541 + z5
	faddp	st2,st0		; z4 = o12*F_1_306 + z5, z5 popped
	; stack: z3 z4 z2 tmp7

	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
	fsub	st0,st1
	fxch	st0,st1
	faddp	st4,st0
	; stack: z13 z4 z2 z11	(z13 = tmp7 - z3, z11 = tmp7 + z3)

	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
	fsubr	st0,st1
	fxch	st0,st3
	faddp	st1,st0
	; stack: (z13+z2) z4 (z13-z2) z11
	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
	fsubr	st0,st4
	fxch	st0,st2
	faddp	st4,st0
	; stack: (z13+z2) (z11-z4) (z13-z2) (z11+z4)
	;	 = results for elements 5, 7, 3, 1

	fstp	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
	; x87 stack is back at its depth from the start of the iteration.

	add	edx, byte DCTSIZE*SIZEOF_FAST_FLOAT	; advance pointer to next row
	dec	ecx
	jnz	near .rowloop	; loop body is longer than a short jump can span

	; ---- Pass 2: process columns.
	; Same instruction sequence as pass 1, but elements are addressed with
	; COL() and edx advances by a single element per iteration.  The input
	; is the output of pass 1.

	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
	mov	ecx, DCTSIZE
	alignx	16,7
.columnloop:
	; Each fld/fadd pair pushes one of the sums tmp1, tmp0, tmp3, tmp2.
	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
	fadd	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
	; stack: tmp2 tmp3 tmp0 tmp1

	; -- Even part

	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
	fsub	st0,st2
	fxch	st0,st2
	faddp	st3,st0
	; stack: tmp2 tmp13 tmp10 tmp1
	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
	fsub	st0,st1
	fxch	st0,st1
	faddp	st4,st0
	; stack: tmp12 tmp13 tmp10 tmp11

	fadd	st0,st1
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
	; stack: z1 tmp13 tmp10 tmp11	(z1 = (tmp12+tmp13) * F_0_707)

	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
	fsub	st0,st4
	fxch	st0,st3
	faddp	st4,st0
	; stack: z1 tmp13 (tmp10-tmp11) (tmp10+tmp11)
	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
	fsub	st0,st1
	fxch	st0,st2
	faddp	st1,st0
	; stack: (tmp13+z1) (tmp13-z1) (tmp10-tmp11) (tmp10+tmp11)
	;	 = results for elements 2, 6, 4, 0

	; Push the odd-part differences before the even results overwrite
	; their inputs; each fxch st0,st4 tucks the new value below the four
	; pending even results.
	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
	fsub	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
	fxch	st0,st4
	; stack: out2 out6 out4 out0 tmp5 tmp6 tmp4 tmp7  (all 8 registers)

	fstp	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
	; stack: tmp5 tmp6 tmp4 tmp7

	; -- Odd part

	fadd	st2,st0		; st2 = o10 = tmp4 + tmp5
	fadd	st0,st1		; st0 = o11 = tmp5 + tmp6
	fxch	st0,st3
	fadd	st1,st0		; st1 = o12 = tmp6 + tmp7
	fxch	st0,st3
	; stack: o11 o12 o10 tmp7

	fld	st2		; extra copy of o10
	fxch	st0,st1
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]	; z3 = o11 * F_0_707
	fxch	st0,st1
	fsub	st0,st2					; o10 - o12
	fxch	st0,st3
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]	; o10 * F_0_541
	fxch	st0,st3
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]	; z5 = (o10-o12) * F_0_382
	fxch	st0,st2
	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]	; o12 * F_1_306
	fxch	st0,st2
	fadd	st3,st0		; z2 = o10*F_0_541 + z5
	faddp	st2,st0		; z4 = o12*F_1_306 + z5, z5 popped
	; stack: z3 z4 z2 tmp7

	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
	fsub	st0,st1
	fxch	st0,st1
	faddp	st4,st0
	; stack: z13 z4 z2 z11	(z13 = tmp7 - z3, z11 = tmp7 + z3)

	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
	fsubr	st0,st1
	fxch	st0,st3
	faddp	st1,st0
	; stack: (z13+z2) z4 (z13-z2) z11
	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
	fsubr	st0,st4
	fxch	st0,st2
	faddp	st4,st0
	; stack: (z13+z2) (z11-z4) (z13-z2) (z11+z4)
	;	 = results for elements 5, 7, 3, 1

	fstp	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
	fstp	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
	; x87 stack is back at its depth from the start of the iteration.

	add	edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
	dec	ecx
	jnz	near .columnloop	; loop body is longer than a short jump can span

;	pop	edi		; unused
;	pop	esi		; unused
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
	poppic	ebx
	pop	ebp
	ret

%endif ; DCT_FLOAT_SUPPORTED