;
; jfdctfst.asm - fast integer FDCT (non-SIMD)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctfst.c; see the jfdctfst.c for
; more details.
;
; Last Modified : October 17, 2004
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

%ifdef DCT_IFAST_SUPPORTED

; This module is specialized to the case DCTSIZE = 8.
;
%if DCTSIZE != 8
%error "Sorry, this code only copes with 8x8 DCTs."
%endif

; --------------------------------------------------------------------------

; We can gain a little more speed, with a further compromise in accuracy,
; by omitting the addition in a descaling shift.  This yields an
; incorrectly rounded result half the time...
;
%macro	descale 2
%ifdef USE_ACCURATE_ROUNDING
%if (%2)<=7
	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
%else
	add	%1, (1<<((%2)-1))	; add reg32,imm32
%endif
%endif
	sar	%1,%2
%endmacro

; --------------------------------------------------------------------------

%define CONST_BITS	8

%if CONST_BITS == 8
F_0_382	equ	 98		; FIX(0.382683433)
F_0_541	equ	139		; FIX(0.541196100)
F_0_707	equ	181		; FIX(0.707106781)
F_1_306	equ	334		; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
%endif

; --------------------------------------------------------------------------
	SECTION	SEG_TEXT
	BITS	32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jpeg_fdct_ifast (DCTELEM * data)
;

%define data(b)	(b)+8		; DCTELEM * data

	align	16
	global	EXTN(jpeg_fdct_ifast)

EXTN(jpeg_fdct_ifast):
	push	ebp
	mov	ebp,esp
	push	ebx
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
	push	esi
	push	edi

	; ---- Pass 1: process rows.

	mov	ecx, DCTSIZE
	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
	alignx	16,7
.rowloop:
	push	ecx		; ctr
	push	edx		; dataptr

	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
	lea	esi,[eax+edi]	; esi=tmp0
	sub	eax,edi		; eax=tmp7
	push	eax

	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
	lea	edi,[ebx+ecx]	; edi=tmp1
	sub	ebx,ecx		; ebx=tmp6
	push	ebx

	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
	lea	ebx,[eax+ecx]	; ebx=tmp2
	sub	eax,ecx		; eax=tmp5
	push	eax

	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
	lea	edx,[ecx+eax]	; edx=tmp3
	sub	ecx,eax		; ecx=tmp4
	push	ecx

	; -- Even part

	lea	eax,[esi+edx]	; eax=tmp10
	lea	ecx,[edi+ebx]	; ecx=tmp11
	sub	esi,edx		; esi=tmp13
	sub	edi,ebx		; edi=tmp12

	mov	edx, POINTER [esp+16]	; dataptr

	add	edi,esi
	imul	edi,(F_0_707)	; edi=z1
	descale	edi,CONST_BITS

	lea	ebx,[eax+ecx]	; ebx=data0
	sub	eax,ecx		; eax=data4
	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax

	lea	ecx,[esi+edi]	; ecx=data2
	sub	esi,edi		; esi=data6
	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], cx
	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], si

	; -- Odd part

	pop	eax	; eax=tmp4
	pop	edx	; edx=tmp5
	pop	ebx	; ebx=tmp6
	pop	edi	; edi=tmp7

	add	eax,edx		; eax=tmp10
	add	edx,ebx		; edx=tmp11
	add	ebx,edi		; ebx=tmp12, edi=tmp7

	imul	edx,(F_0_707)	; edx=z3
	descale	edx,CONST_BITS
	lea	esi,[edi+edx]	; esi=z11
	sub	edi,edx		; edi=z13

	mov	ecx,eax		; ecx=tmp10
	sub	eax,ebx
	imul	eax,(F_0_382)	; eax=z5
	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
	descale	eax,CONST_BITS
	descale	ecx,CONST_BITS
	descale	ebx,CONST_BITS
	add	ecx,eax		; ecx=z2
	add	ebx,eax		; ebx=z4

	pop	edx		; dataptr

	lea	eax,[edi+ecx]	; eax=data5
	sub	edi,ecx		; edi=data3
	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], ax
	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], di

	lea	ecx,[esi+ebx]	; ecx=data1
	sub	esi,ebx		; esi=data7
	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], cx
	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], si

	pop	ecx		; ctr

	add	edx, byte DCTSIZE*SIZEOF_DCTELEM
	dec	ecx			; advance pointer to next row
	jnz	near .rowloop

	; ---- Pass 2: process columns.

	mov	ecx, DCTSIZE
	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
	alignx	16,7
.columnloop:
	push	ecx		; ctr
	push	edx		; dataptr

	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
	lea	esi,[eax+edi]	; esi=tmp0
	sub	eax,edi		; eax=tmp7
	push	eax

	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
	lea	edi,[ebx+ecx]	; edi=tmp1
	sub	ebx,ecx		; ebx=tmp6
	push	ebx

	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
	lea	ebx,[eax+ecx]	; ebx=tmp2
	sub	eax,ecx		; eax=tmp5
	push	eax

	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
	lea	edx,[ecx+eax]	; edx=tmp3
	sub	ecx,eax		; ecx=tmp4
	push	ecx

	; -- Even part

	lea	eax,[esi+edx]	; eax=tmp10
	lea	ecx,[edi+ebx]	; ecx=tmp11
	sub	esi,edx		; esi=tmp13
	sub	edi,ebx		; edi=tmp12

	mov	edx, POINTER [esp+16]	; dataptr

	add	edi,esi
	imul	edi,(F_0_707)	; edi=z1
	descale	edi,CONST_BITS

	lea	ebx,[eax+ecx]	; ebx=data0
	sub	eax,ecx		; eax=data4
	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax

	lea	ecx,[esi+edi]	; ecx=data2
	sub	esi,edi		; esi=data6
	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], cx
	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], si

	; -- Odd part

	pop	eax	; eax=tmp4
	pop	edx	; edx=tmp5
	pop	ebx	; ebx=tmp6
	pop	edi	; edi=tmp7

	add	eax,edx		; eax=tmp10
	add	edx,ebx		; edx=tmp11
	add	ebx,edi		; ebx=tmp12, edi=tmp7

	imul	edx,(F_0_707)	; edx=z3
	descale	edx,CONST_BITS
	lea	esi,[edi+edx]	; esi=z11
	sub	edi,edx		; edi=z13

	mov	ecx,eax		; ecx=tmp10
	sub	eax,ebx
	imul	eax,(F_0_382)	; eax=z5
	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
	descale	eax,CONST_BITS
	descale	ecx,CONST_BITS
	descale	ebx,CONST_BITS
	add	ecx,eax		; ecx=z2
	add	ebx,eax		; ebx=z4

	pop	edx		; dataptr

	lea	eax,[edi+ecx]	; eax=data5
	sub	edi,ecx		; edi=data3
	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], ax
	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], di

	lea	ecx,[esi+ebx]	; ecx=data1
	sub	esi,ebx		; esi=data7
	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], cx
	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], si

	pop	ecx		; ctr

	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
	dec	ecx
	jnz	near .columnloop

	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
	pop	ebx
	pop	ebp
	ret

%endif ; DCT_IFAST_SUPPORTED