; Independent JPEG Group's JPEG software release 6b with x86 SIMD extension for IJG JPEG library version 1.02
; (listing: 474 lines, 11 KiB, NASM)
;
; jidctflt.asm - floating-point IDCT (non-SIMD)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.
;
; Last Modified : October 17, 2004
;
; [TAB8]
|
|
|
|
%include "jsimdext.inc"
|
|
%include "jdct.inc"
|
|
|
|
%ifdef DCT_FLOAT_SUPPORTED
|
|
|
|
; This module is specialized to the case DCTSIZE = 8.
; The loops below hard-wire eight loads/stores per column and per row,
; so assembly is aborted for any other block size.
;
%if DCTSIZE != 8
%error "Sorry, this code only copes with 8x8 DCTs."
%endif
|
|
|
|
; --------------------------------------------------------------------------
; Constant table for jpeg_idct_float.
;
; Four single-precision multipliers used by the rotation steps of both
; passes.  They are read through GOTOFF(ebx,F_x_xxx) with operand size
; ROTATOR_TYPE, so the element type here (dd) and ROTATOR_TYPE (FP32) must
; stay in agreement.  The table is exported as jconst_idct_float and padded
; to a 16-byte boundary on both sides with alignz.
;
        SECTION SEG_CONST

%define ROTATOR_TYPE    FP32            ; float

        alignz  16
        global  EXTN(jconst_idct_float)

EXTN(jconst_idct_float):

F_1_414 dd      1.414213562373095048801689      ; 2*cos(PI*1/4)
F_1_847 dd      1.847759065022573512256366      ; 2*cos(PI*1/8)
F_1_082 dd      1.082392200292393968799446      ; 2*(cos(PI*1/8)-cos(PI*3/8))
F_2_613 dd      2.613125929752753055713286      ; 2*(cos(PI*1/8)+cos(PI*3/8))

        alignz  16
|
|
|
|
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
;                  JCOEFPTR coef_block,
;                  JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Stack-passed arguments, addressed relative to a frame base register b
; (ebp once the prologue has run `push ebp / mov ebp,esp`).  [b+0] is the
; saved ebp and [b+4] the return address, so the first argument is at b+8
; and each following argument is 4 bytes higher.

%define cinfo(b)        (b)+8           ; j_decompress_ptr cinfo
%define compptr(b)      (b)+12          ; jpeg_component_info * compptr
%define coef_block(b)   (b)+16          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+24          ; JDIMENSION output_col

; Locals, laid out downward from ebp.  Each macro is defined in terms of
; the previous one, so the order below is the order in memory:
;
;       ebp - SIZEOF_FP64                       tmp (one spilled x87 value)
;       tmp - DCTSIZE2*SIZEOF_FAST_FLOAT        workspace[DCTSIZE2]
;       workspace - SIZEOF_FP32                 rndint_magic
;       rndint_magic - SIZEOF_POINTER           gotptr
;
; The prologue sets esp to `workspace` and then fills rndint_magic and
; gotptr with two pushes, so these two definitions must stay directly
; below workspace and in this order.

%define tmp             ebp-SIZEOF_FP64         ; double tmp
%define workspace       tmp-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]
%define rndint_magic    workspace-SIZEOF_FP32
                                        ; float rndint_magic = 100663296.0F
%define gotptr          rndint_magic-SIZEOF_POINTER     ; void * gotptr
|
|
|
|
; --------------------------------------------------------------------------
; jpeg_idct_float -- two-pass 8x8 floating-point inverse DCT on the x87 FPU.
;
; Pass 1 walks the DCTSIZE columns of coef_block.  Each 16-bit coefficient
; is loaded with fild, multiplied by the matching entry of the table
; reached through jcompinfo_dct_table(compptr), and the 1-D result is
; written to the on-stack workspace.
; Pass 2 walks the DCTSIZE rows of the workspace, adds rndint_magic to
; every result, stores it as a 32-bit float, masks that bit pattern with
; RANGE_MASK and uses it to index the table taken from
; jdstruct_sample_range_limit(cinfo) (advanced by CENTERJSAMPLE).
; Adding 0x4CC00000 (1.5 * 2^26) puts the sum in the binade where one
; float ulp is 8, so the low mantissa bits hold the value divided by 8 and
; rounded by the FPU's current rounding mode.
;
; Register use:
;   ebx, esi, edi   saved in the prologue and restored in the epilogue
;   eax, ecx, edx   scratch, not preserved
;   x87 stack       up to eight values are live at once, so the register
;                   stack has to be empty on entry; every load is matched
;                   by a pop, leaving it unchanged on exit
;
; Notation used in the stack-state comments:
;   inN, qN  coefficient N of the current column and its multiplier
;   dN       inN*qN (pass 1) or element N of the current row (pass 2)
;   even:    tmp13 = d2+d6          tmp12 = (d2-d6)*F_1_414 - tmp13
;            tmp10 = d0+d4          tmp11 = d0-d4
;            e0 = tmp10+tmp13       e3 = tmp10-tmp13
;            e1 = tmp11+tmp12       e2 = tmp11-tmp12
;   odd:     z11 = d1+d7   z12 = d1-d7   z13 = d5+d3   z10 = d5-d3
;            o7  = z11+z13          t11 = (z11-z13)*F_1_414
;            z5  = (z10+z12)*F_1_847
;            t10 = z12*F_1_082 - z5 t12 = z5 - z10*F_2_613
;            o6  = t12-o7           o5 = t11-o6        o4 = t10+o5
;   output:  out0 = e0+o7  out7 = e0-o7  out1 = e1+o6  out6 = e1-o6
;            out2 = e2+o5  out5 = e2-o5  out4 = e3+o4  out3 = e3-o4
;
        align   16
        global  EXTN(jpeg_idct_float)

EXTN(jpeg_idct_float):
        push    ebp
        mov     ebp,esp
        lea     esp, [workspace]        ; reserve tmp and workspace[]
        push    FP32 0x4CC00000         ; (float)(0x00C00000 << 3)
                                        ;   -> lands in the rndint_magic slot
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        ; ---- Pass 1: process columns from input, store into work array.

        mov     edx, POINTER [compptr(ebp)]
        mov     edx, POINTER [jcompinfo_dct_table(edx)]        ; quantptr
        mov     esi, JCOEFPTR [coef_block(ebp)]                 ; inptr
        lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     ecx, DCTSIZE                            ; ctr
        alignx  16,7
.columnloop:
        ; OR the seven AC coefficients of this column together; any set
        ; bit means the full transform is needed.  bx is used as a second
        ; accumulator, which is why .columnDCT reloads ebx from gotptr.
        mov     ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
        or      ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
        jnz     short .columnDCT

        mov     bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
        mov     ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
        or      bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
        or      ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
        or      bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
        or      ax,bx
        jnz     short .columnDCT

        ; -- AC terms all zero
        ;    The column is constant: replicate in0*q0 into all 8 slots.

        fild    JCOEF [COL(0,esi,SIZEOF_JCOEF)]
        fmul    FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        fst     FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
        fst     FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
        fst     FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
        fst     FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
        fst     FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
        fst     FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
        fst     FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]      ; pop: stack empty
        jmp     near .nextcolumn
        alignx  16,7

.columnDCT:
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; -- Even part

        fild    JCOEF [COL(2,esi,SIZEOF_JCOEF)]
        fild    JCOEF [COL(6,esi,SIZEOF_JCOEF)]
        fild    JCOEF [COL(4,esi,SIZEOF_JCOEF)]
        fild    JCOEF [COL(0,esi,SIZEOF_JCOEF)]
                                        ; st0=in0 st1=in4 st2=in6 st3=in2

        fxch    st0,st3

        fmul    FLOAT_MULT_TYPE [COL(2,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st2
        fmul    FLOAT_MULT_TYPE [COL(6,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st1
        fmul    FLOAT_MULT_TYPE [COL(4,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st3
        fmul    FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st1
                                        ; st0=d6 st1=d0 st2=d2 st3=d4

        fld     st2             ; st2 = st2 + st0, st0 = st2 - st0
        fsub    st0,st1
        fxch    st0,st1
        faddp   st3,st0
                                        ; st0=d2-d6 st1=d0 st2=tmp13 st3=d4

        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]

        fld     st3             ; st1 = st1 + st3, st3 = st1 - st3
        fsubr   st0,st2
        fxch    st0,st4
        faddp   st2,st0
                                        ; st1=tmp10 st2=tmp13 st3=tmp11

        fsub    st0,st2                 ; st0 = tmp12

        fld     st1             ; st2 = st1 + st2, st1 = st1 - st2
        fsub    st0,st3
        fxch    st0,st2
        faddp   st3,st0
        fld     st3             ; st0 = st3 + st0, st3 = st3 - st0
        fsub    st0,st1
        fxch    st0,st4
        faddp   st1,st0
                                        ; st0=e1 st1=e3 st2=e0 st3=e2

        ; -- Odd part
        ;    The four loads below fill the x87 stack (8 entries); e3 is
        ;    spilled to [tmp] to free a slot for the butterflies.

        fild    JCOEF [COL(1,esi,SIZEOF_JCOEF)]
        fild    JCOEF [COL(7,esi,SIZEOF_JCOEF)]
        fild    JCOEF [COL(3,esi,SIZEOF_JCOEF)]
        fild    JCOEF [COL(5,esi,SIZEOF_JCOEF)]
                                        ; st0=in5 st1=in3 st2=in7 st3=in1
                                        ; st4=e1  st5=e3  st6=e0  st7=e2

        fxch    st0,st3

        fmul    FLOAT_MULT_TYPE [COL(1,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st2
        fmul    FLOAT_MULT_TYPE [COL(7,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st1
        fmul    FLOAT_MULT_TYPE [COL(3,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st6
        fxch    st3,st0
        fmul    FLOAT_MULT_TYPE [COL(5,edx,SIZEOF_FLOAT_MULT_TYPE)]
        fxch    st0,st5
        fstp    FP64 [tmp]              ; [tmp] = e3
                                        ; st0=d7 st1=d1 st2=e0 st3=e1
                                        ; st4=d5 st5=d3 st6=e2

        fld     st1             ; st1 = st1 + st0, st0 = st1 - st0
        fsub    st0,st1
        fxch    st0,st1
        faddp   st2,st0
        fld     st5             ; st4 = st4 + st5, st5 = st4 - st5
        fsubr   st0,st5
        fxch    st0,st6
        faddp   st5,st0
                                        ; st0=z12 st1=z11 st4=z13 st5=z10

        fld     st1             ; st1 = st1 + st4, st4 = st1 - st4
        fsub    st0,st5
        fxch    st0,st5
        faddp   st2,st0
                                        ; st0=z12 st1=o7 st4=z11-z13 st5=z10

        fld     st5
        fadd    st0,st1                 ; st0 = z10+z12
        fxch    st0,st5
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]      ; t11
        fxch    st0,st5
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]      ; z5
        fxch    st0,st6
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]      ; z10*F_2_613
        fxch    st0,st1
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]      ; z12*F_1_082
        fxch    st0,st6
        fsubr   st1,st0                 ; st1 = z5 - z10*F_2_613 = t12
        fsubp   st6,st0                 ; st6 = z12*F_1_082 - z5 = t10
                                        ; st0=t12 st1=o7 st2=e0 st3=e1
                                        ; st4=t11 st5=t10 st6=e2

        ; -- Final output stage

        fsub    st0,st1                 ; st0 = o6
        fld     st2             ; st1 = st2 + st1, st2 = st2 - st1
        fsub    st0,st2
        fxch    st0,st3
        faddp   st2,st0
        fsub    st4,st0                 ; st4 = o5
        fld     st3             ; st0 = st3 + st0, st3 = st3 - st0
        fsub    st0,st1
        fxch    st0,st4
        faddp   st1,st0
                                        ; st0=out1 st1=out0 st2=out7 st3=out6
                                        ; st4=o5   st5=t10  st6=e2

        fxch    st0,st2

        fstp    FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
                                        ; st0=o5 st1=t10 st2=e2

        fadd    st1,st0                 ; st1 = o4
        fld     FP64 [tmp]              ; reload e3
        fld     st1             ; st3 = st3 + st1, st1 = st3 - st1
        fsubr   st0,st4
        fxch    st0,st2
        faddp   st4,st0
        fld     st0             ; st0 = st0 + st2, st2 = st0 - st2
        fsub    st0,st3
        fxch    st0,st3
        faddp   st1,st0
                                        ; st0=out4 st1=out5 st2=out3 st3=out2

        fxch    st0,st3

        fstp    FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
        fstp    FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]      ; stack empty

.nextcolumn:
        add     esi, byte SIZEOF_JCOEF          ; advance pointers to next column
        add     edx, byte SIZEOF_FLOAT_MULT_TYPE
        add     edi, byte SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .columnloop

        ; ---- Pass 2: process rows from work array, store into output array.

        mov     edx, POINTER [cinfo(ebp)]
        mov     edx, POINTER [jdstruct_sample_range_limit(edx)]
        ; Adds CENTERJSAMPLE*SIZEOF_JSAMPLE, written as the subtraction of
        ; its negative -- presumably so the immediate still fits the signed
        ; `byte` form; confirm against the CENTERJSAMPLE definition.
        sub     edx, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE        ; JSAMPLE * range_limit

        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
        mov     ecx, DCTSIZE                            ; ctr
        alignx  16,7
.rowloop:
        push    edi                     ; keep the row-pointer cursor;
                                        ;   popped again at .nextrow
        mov     edi, JSAMPROW [edi]                     ; (JSAMPLE *)
        add     edi, JDIMENSION [output_col(ebp)]       ; edi=outptr

%ifndef NO_ZERO_ROW_TEST_FLOAT
        ; The floats are tested as raw 32-bit integers.  Doubling discards
        ; the sign bit, so both +0.0 and -0.0 count as zero.
        mov     eax, FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
        add     eax,eax         ; shl eax,1 (shift out the sign bit)
        jnz     short .rowDCT

        mov     eax, FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
        mov     ebx, FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
        or      eax, FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
        or      ebx, FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
        or      eax, FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
        or      ebx, FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
        or      eax,ebx
        add     eax,eax         ; shl eax,1 (shift out the sign bit)
        jnz     short .rowDCT

        ; -- AC terms all zero
        ;    Convert element 0 once and write the same sample 8 times.

        push    eax                     ; one dword of scratch on the stack

        fld     FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
        fadd    FP32 [rndint_magic]
        fstp    FP32 [esp]

        pop     eax                     ; raw float bits of (value + magic)
        and     eax,RANGE_MASK
        mov     al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
        mov     JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
        jmp     near .nextrow
        alignx  16,7
%endif
.rowDCT:
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; -- Even part

        fld     FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
        fld     FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
        fld     FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
        fld     FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
                                        ; st0=d6 st1=d0 st2=d2 st3=d4

        fld     st2             ; st2 = st2 + st0, st0 = st2 - st0
        fsub    st0,st1
        fxch    st0,st1
        faddp   st3,st0
                                        ; st0=d2-d6 st1=d0 st2=tmp13 st3=d4

        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]

        fld     st3             ; st1 = st1 + st3, st3 = st1 - st3
        fsubr   st0,st2
        fxch    st0,st4
        faddp   st2,st0
                                        ; st1=tmp10 st2=tmp13 st3=tmp11

        fsub    st0,st2                 ; st0 = tmp12

        fld     st1             ; st2 = st1 + st2, st1 = st1 - st2
        fsub    st0,st3
        fxch    st0,st2
        faddp   st3,st0
        fld     st3             ; st0 = st3 + st0, st3 = st3 - st0
        fsub    st0,st1
        fxch    st0,st4
        faddp   st1,st0
                                        ; st0=e1 st1=e3 st2=e0 st3=e2

        ; -- Odd part

        fld     FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
        fxch    st0,st3
        fld     FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
        fld     FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
        fld     FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
                                        ; st0=d5 st1=d7 st2=d1 st3=e0
                                        ; st4=e1 st5=e3 st6=d3 st7=e2
        fxch    st0,st5
        fstp    FP64 [tmp]              ; [tmp] = e3
                                        ; st0=d7 st1=d1 st2=e0 st3=e1
                                        ; st4=d5 st5=d3 st6=e2

        fld     st1             ; st1 = st1 + st0, st0 = st1 - st0
        fsub    st0,st1
        fxch    st0,st1
        faddp   st2,st0
        fld     st5             ; st4 = st4 + st5, st5 = st4 - st5
        fsubr   st0,st5
        fxch    st0,st6
        faddp   st5,st0
                                        ; st0=z12 st1=z11 st4=z13 st5=z10

        fld     st1             ; st1 = st1 + st4, st4 = st1 - st4
        fsub    st0,st5
        fxch    st0,st5
        faddp   st2,st0
                                        ; st0=z12 st1=o7 st4=z11-z13 st5=z10

        fld     st5
        fadd    st0,st1                 ; st0 = z10+z12
        fxch    st0,st5
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]      ; t11
        fxch    st0,st5
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]      ; z5
        fxch    st0,st6
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]      ; z10*F_2_613
        fxch    st0,st1
        fmul    ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]      ; z12*F_1_082
        fxch    st0,st6
        fsubr   st1,st0                 ; st1 = z5 - z10*F_2_613 = t12
        fsubp   st6,st0                 ; st6 = z12*F_1_082 - z5 = t10
                                        ; st0=t12 st1=o7 st2=e0 st3=e1
                                        ; st4=t11 st5=t10 st6=e2

        ; -- Final output stage
        ;    Reserve DCTSIZE dwords below esp.  Each result is stored there
        ;    as (value + rndint_magic) in FP32 form and then popped back as
        ;    an integer bit pattern by the %rep block at the end.

        sub     esp, byte DCTSIZE*SIZEOF_FP32

        fsub    st0,st1                 ; st0 = o6
        fld     st2             ; st1 = st2 + st1, st2 = st2 - st1
        fsub    st0,st2
        fxch    st0,st3
        faddp   st2,st0
        fsub    st4,st0                 ; st4 = o5
        fld     st3             ; st0 = st3 + st0, st3 = st3 - st0
        fsub    st0,st1
        fxch    st0,st4
        faddp   st1,st0
                                        ; st0=out1 st1=out0 st2=out7 st3=out6
                                        ; st4=o5   st5=t10  st6=e2

        fld     FP32 [rndint_magic]

        fadd    st4,st0                 ; out6 += magic
        fadd    st1,st0                 ; out1 += magic
        fadd    st2,st0                 ; out0 += magic
        fadd    st3,st0                 ; out7 += magic

        fxch    st0,st4                 ; keep magic on the stack (now st4)

        fstp    FP32 [esp+6*SIZEOF_FP32]
        fstp    FP32 [esp+1*SIZEOF_FP32]
        fstp    FP32 [esp+0*SIZEOF_FP32]
        fstp    FP32 [esp+7*SIZEOF_FP32]
                                        ; st0=magic st1=o5 st2=t10 st3=e2

        fxch    st0,st1

        fadd    st2,st0                 ; st2 = o4
        fld     FP64 [tmp]              ; reload e3
        fld     st1             ; st4 = st4 + st1, st1 = st4 - st1
        fsubr   st0,st5
        fxch    st0,st2
        faddp   st5,st0
        fld     st0             ; st0 = st0 + st3, st3 = st0 - st3
        fsub    st0,st4
        fxch    st0,st4
        faddp   st1,st0
                                        ; st0=out4 st1=out5 st2=magic
                                        ; st3=out3 st4=out2

        fxch    st0,st2

        fadd    st1,st0                 ; out5 += magic
        fadd    st2,st0                 ; out4 += magic
        fadd    st3,st0                 ; out3 += magic
        faddp   st4,st0                 ; out2 += magic, drop magic

        fstp    FP32 [esp+5*SIZEOF_FP32]
        fstp    FP32 [esp+4*SIZEOF_FP32]
        fstp    FP32 [esp+3*SIZEOF_FP32]
        fstp    FP32 [esp+2*SIZEOF_FP32]       ; x87 stack empty

        ; Pop the eight stored patterns two at a time (slot 0 first),
        ; mask, look up the sample and write it.  The eight pops release
        ; exactly the space reserved by the `sub esp` above.
%assign i 0             ; i=0;
%rep 4                  ; -- repeat 4 times ---
        pop     eax
        pop     ebx
        and     eax,RANGE_MASK
        and     ebx,RANGE_MASK
        mov     al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
        mov     bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
        mov     JSAMPLE [edi+(i+0)*SIZEOF_JSAMPLE], al
        mov     JSAMPLE [edi+(i+1)*SIZEOF_JSAMPLE], bl
%assign i i+2           ; i+=2;
%endrep                 ; -- repeat end ---

.nextrow:
        pop     edi                     ; back to the (JSAMPROW *) cursor
        add     esi, byte DCTSIZE*SIZEOF_FAST_FLOAT
        add     edi, byte SIZEOF_JSAMPROW       ; advance pointer to next row
        dec     ecx
        jnz     near .rowloop

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp                 ; discard gotptr, magic, workspace, tmp
        pop     ebp
        ret
|
|
|
|
%endif ; DCT_FLOAT_SUPPORTED
|