Add MMX SIMD implementation of computationally intensive routines.
This commit is contained in:
@@ -20,7 +20,12 @@ BUILT_SOURCES = simd/jsimdcfg.inc
|
||||
EXTRA_DIST = nasm_lt.sh
|
||||
|
||||
libjpeg_la_SOURCES += simd/jsimd.h simd/jsimdcfg.inc.h \
|
||||
simd/jsimdext.inc simd/jsimdcpu.asm
|
||||
simd/jsimdext.inc simd/jcolsamp.inc simd/jdct.inc \
|
||||
simd/jsimdcpu.asm \
|
||||
simd/jccolmmx.asm simd/jdcolmmx.asm \
|
||||
simd/jcsammmx.asm simd/jdsammmx.asm simd/jdmermmx.asm \
|
||||
simd/jcqntmmx.asm simd/jfmmxfst.asm simd/jfmmxint.asm \
|
||||
simd/jimmxred.asm simd/jimmxint.asm simd/jimmxfst.asm
|
||||
|
||||
endif
|
||||
|
||||
|
||||
6
jdct.h
6
jdct.h
@@ -28,10 +28,16 @@
|
||||
*/
|
||||
|
||||
#if BITS_IN_JSAMPLE == 8
|
||||
#ifndef WITH_SIMD
|
||||
typedef int DCTELEM; /* 16 or 32 bits is fine */
|
||||
typedef unsigned int UDCTELEM;
|
||||
typedef unsigned long long UDCTELEM2;
|
||||
#else
|
||||
typedef short DCTELEM; /* prefer 16 bit with SIMD for parallelism */
|
||||
typedef unsigned short UDCTELEM;
|
||||
typedef unsigned int UDCTELEM2;
|
||||
#endif
|
||||
#else
|
||||
typedef INT32 DCTELEM; /* must have 32 bits */
|
||||
typedef UINT32 UDCTELEM;
|
||||
typedef unsigned long long UDCTELEM2;
|
||||
|
||||
@@ -325,7 +325,11 @@ typedef int boolean;
|
||||
*/
|
||||
|
||||
#ifndef MULTIPLIER
|
||||
#ifndef WITH_SIMD
|
||||
#define MULTIPLIER int /* type for fastest integer multiply */
|
||||
#else
|
||||
#define MULTIPLIER short /* prefer 16-bit with SIMD for parallelism */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
282
jsimd.c
282
jsimd.c
@@ -43,6 +43,17 @@ jsimd_can_rgb_ycc (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -51,6 +62,17 @@ jsimd_can_ycc_rgb (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -59,6 +81,11 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_rgb_ycc_convert_mmx(cinfo->image_width, input_buf,
|
||||
output_buf, output_row, num_rows);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -66,6 +93,11 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_ycc_rgb_convert_mmx(cinfo->output_width, input_buf,
|
||||
input_row, output_buf, num_rows);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
@@ -73,6 +105,15 @@ jsimd_can_h2v2_downsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -81,6 +122,15 @@ jsimd_can_h2v1_downsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -88,12 +138,24 @@ GLOBAL(void)
|
||||
jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
@@ -101,6 +163,15 @@ jsimd_can_h2v2_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -109,6 +180,15 @@ jsimd_can_h2v1_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -118,6 +198,11 @@ jsimd_h2v2_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
cinfo->output_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -126,6 +211,11 @@ jsimd_h2v1_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
cinfo->output_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
@@ -133,6 +223,15 @@ jsimd_can_h2v2_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -141,6 +240,15 @@ jsimd_can_h2v1_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -150,6 +258,11 @@ jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -158,6 +271,11 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
@@ -165,6 +283,15 @@ jsimd_can_h2v2_merged_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -173,6 +300,15 @@ jsimd_can_h2v1_merged_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -182,6 +318,11 @@ jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
|
||||
JDIMENSION in_row_group_ctr,
|
||||
JSAMPARRAY output_buf)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_merged_upsample_mmx(cinfo->output_width, input_buf,
|
||||
in_row_group_ctr, output_buf);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -190,6 +331,11 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
|
||||
JDIMENSION in_row_group_ctr,
|
||||
JSAMPARRAY output_buf)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_merged_upsample_mmx(cinfo->output_width, input_buf,
|
||||
in_row_group_ctr, output_buf);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
@@ -197,6 +343,19 @@ jsimd_can_convsamp (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -212,6 +371,10 @@ GLOBAL(void)
|
||||
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_convsamp_mmx(sample_data, start_col, workspace);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -225,6 +388,15 @@ jsimd_can_fdct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -233,6 +405,15 @@ jsimd_can_fdct_ifast (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -247,11 +428,19 @@ jsimd_can_fdct_float (void)
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_islow (DCTELEM * data)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_fdct_islow_mmx(data);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_ifast (DCTELEM * data)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_fdct_ifast_mmx(data);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -264,6 +453,17 @@ jsimd_can_quantize (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -279,6 +479,10 @@ GLOBAL(void)
|
||||
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_quantize_mmx(coef_block, divisors, workspace);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -292,6 +496,21 @@ jsimd_can_idct_2x2 (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(ISLOW_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -300,6 +519,21 @@ jsimd_can_idct_4x4 (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(ISLOW_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -308,6 +542,10 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
#if WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -315,6 +553,10 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
#if WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
@@ -322,6 +564,21 @@ jsimd_can_idct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(ISLOW_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -330,6 +587,23 @@ jsimd_can_idct_ifast (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(IFAST_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
if (IFAST_SCALE_BITS != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -346,6 +620,10 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
#if WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
@@ -353,6 +631,10 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
#if WITH_SIMD
|
||||
if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
||||
508
simd/jccolmmx.asm
Normal file
508
simd/jccolmmx.asm
Normal file
@@ -0,0 +1,508 @@
|
||||
;
|
||||
; jccolmmx.asm - colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS 16
|
||||
|
||||
F_0_081 equ 5329 ; FIX(0.08131)
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_168 equ 11059 ; FIX(0.16874)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_331 equ 21709 ; FIX(0.33126)
|
||||
F_0_418 equ 27439 ; FIX(0.41869)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_rgb_ycc_convert_mmx)
|
||||
|
||||
EXTN(jconst_rgb_ycc_convert_mmx):
|
||||
|
||||
PW_F0299_F0337 times 2 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 2 dw F_0_114, F_0_250
|
||||
PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331
|
||||
PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418
|
||||
PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
|
||||
PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
|
||||
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
%define img_width(b) (b)+8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b)+20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b)+24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_rgb_ycc_convert_mmx)
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
|
||||
test ecx,ecx
|
||||
jz near .return
|
||||
|
||||
push ecx
|
||||
|
||||
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax,eax
|
||||
jle near .return
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
pushpic eax
|
||||
push edx
|
||||
push ebx
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
mov ebx, JSAMPROW [ebx] ; outptr1
|
||||
mov edx, JSAMPROW [edx] ; outptr2
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae short .columnloop
|
||||
alignx 16,7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push eax
|
||||
push edx
|
||||
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
xor eax,eax
|
||||
mov al, BYTE [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
xor edx,edx
|
||||
mov dx, WORD [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax,edx
|
||||
.column_ld4:
|
||||
movd mmA,eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd mmG, DWORD [esi+ecx]
|
||||
psllq mmA, DWORD_BIT
|
||||
por mmA,mmG
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
movq mmG,mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jmp short .rgb_ycc_cnv
|
||||
.column_ld16:
|
||||
test cl, 2*SIZEOF_MMWORD
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movq mmF,mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16,7
|
||||
|
||||
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; mmA=(00 10 20 01 11 21 02 12)
|
||||
; mmG=(22 03 13 23 04 14 24 05)
|
||||
; mmF=(15 25 06 16 26 07 17 27)
|
||||
|
||||
movq mmD,mmA
|
||||
psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
|
||||
psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
|
||||
|
||||
punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
|
||||
psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
|
||||
|
||||
punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
|
||||
punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
|
||||
|
||||
movq mmE,mmA
|
||||
psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
|
||||
psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
|
||||
|
||||
punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
|
||||
|
||||
punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
|
||||
punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
|
||||
|
||||
pxor mmH,mmH
|
||||
|
||||
movq mmC,mmA
|
||||
punpcklbw mmA,mmH ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC,mmH ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmB,mmE
|
||||
punpcklbw mmE,mmH ; mmE=(20 22 24 26)
|
||||
punpckhbw mmB,mmH ; mmB=(01 03 05 07)
|
||||
|
||||
movq mmF,mmD
|
||||
punpcklbw mmD,mmH ; mmD=(11 13 15 17)
|
||||
punpckhbw mmF,mmH ; mmF=(21 23 25 27)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_MMWORD/8
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_MMWORD/8
|
||||
movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_MMWORD/4
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
movq mmF,mmA
|
||||
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_MMWORD/2
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movq mmD,mmA
|
||||
movq mmC,mmF
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16,7
|
||||
|
||||
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; mmA=(00 10 20 30 01 11 21 31)
|
||||
; mmF=(02 12 22 32 03 13 23 33)
|
||||
; mmD=(04 14 24 34 05 15 25 35)
|
||||
; mmC=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movq mmB,mmA
|
||||
punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
|
||||
punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
|
||||
|
||||
movq mmG,mmD
|
||||
punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
|
||||
punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
|
||||
|
||||
movq mmE,mmA
|
||||
punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
|
||||
|
||||
movq mmH,mmB
|
||||
punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
|
||||
punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
|
||||
|
||||
pxor mmF,mmF
|
||||
|
||||
movq mmC,mmA
|
||||
punpcklbw mmA,mmF ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC,mmF ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmD,mmB
|
||||
punpcklbw mmB,mmF ; mmB=(01 03 05 07)
|
||||
punpckhbw mmD,mmF ; mmD=(11 13 15 17)
|
||||
|
||||
movq mmG,mmE
|
||||
punpcklbw mmE,mmF ; mmE=(20 22 24 26)
|
||||
punpckhbw mmG,mmF ; mmG=(30 32 34 36)
|
||||
|
||||
punpcklbw mmF,mmH
|
||||
punpckhbw mmH,mmH
|
||||
psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
|
||||
psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
|
||||
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=RE
|
||||
movq MMWORD [wk(1)], mm1 ; wk(1)=RO
|
||||
movq MMWORD [wk(2)], mm4 ; wk(2)=BE
|
||||
movq MMWORD [wk(3)], mm5 ; wk(3)=BO
|
||||
|
||||
movq mm6,mm1
|
||||
punpcklwd mm1,mm3
|
||||
punpckhwd mm6,mm3
|
||||
movq mm7,mm1
|
||||
movq mm4,mm6
|
||||
pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
pxor mm1,mm1
|
||||
pxor mm6,mm6
|
||||
punpcklwd mm1,mm5 ; mm1=BOL
|
||||
punpckhwd mm6,mm5 ; mm6=BOH
|
||||
psrld mm1,1 ; mm1=BOL*FIX(0.500)
|
||||
psrld mm6,1 ; mm6=BOH*FIX(0.500)
|
||||
|
||||
movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm7,mm1
|
||||
paddd mm4,mm6
|
||||
paddd mm7,mm5
|
||||
paddd mm4,mm5
|
||||
psrld mm7,SCALEBITS ; mm7=CbOL
|
||||
psrld mm4,SCALEBITS ; mm4=CbOH
|
||||
packssdw mm7,mm4 ; mm7=CbO
|
||||
|
||||
movq mm1, MMWORD [wk(2)] ; mm1=BE
|
||||
|
||||
movq mm6,mm0
|
||||
punpcklwd mm0,mm2
|
||||
punpckhwd mm6,mm2
|
||||
movq mm5,mm0
|
||||
movq mm4,mm6
|
||||
pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
pxor mm0,mm0
|
||||
pxor mm6,mm6
|
||||
punpcklwd mm0,mm1 ; mm0=BEL
|
||||
punpckhwd mm6,mm1 ; mm6=BEH
|
||||
psrld mm0,1 ; mm0=BEL*FIX(0.500)
|
||||
psrld mm6,1 ; mm6=BEH*FIX(0.500)
|
||||
|
||||
movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm5,mm0
|
||||
paddd mm4,mm6
|
||||
paddd mm5,mm1
|
||||
paddd mm4,mm1
|
||||
psrld mm5,SCALEBITS ; mm5=CbEL
|
||||
psrld mm4,SCALEBITS ; mm4=CbEH
|
||||
packssdw mm5,mm4 ; mm5=CbE
|
||||
|
||||
psllw mm7,BYTE_BIT
|
||||
por mm5,mm7 ; mm5=Cb
|
||||
movq MMWORD [ebx], mm5 ; Save Cb
|
||||
|
||||
movq mm0, MMWORD [wk(3)] ; mm0=BO
|
||||
movq mm6, MMWORD [wk(2)] ; mm6=BE
|
||||
movq mm1, MMWORD [wk(1)] ; mm1=RO
|
||||
|
||||
movq mm4,mm0
|
||||
punpcklwd mm0,mm3
|
||||
punpckhwd mm4,mm3
|
||||
movq mm7,mm0
|
||||
movq mm5,mm4
|
||||
pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
|
||||
|
||||
paddd mm0, MMWORD [wk(4)]
|
||||
paddd mm4, MMWORD [wk(5)]
|
||||
paddd mm0,mm3
|
||||
paddd mm4,mm3
|
||||
psrld mm0,SCALEBITS ; mm0=YOL
|
||||
psrld mm4,SCALEBITS ; mm4=YOH
|
||||
packssdw mm0,mm4 ; mm0=YO
|
||||
|
||||
pxor mm3,mm3
|
||||
pxor mm4,mm4
|
||||
punpcklwd mm3,mm1 ; mm3=ROL
|
||||
punpckhwd mm4,mm1 ; mm4=ROH
|
||||
psrld mm3,1 ; mm3=ROL*FIX(0.500)
|
||||
psrld mm4,1 ; mm4=ROH*FIX(0.500)
|
||||
|
||||
movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm7,mm3
|
||||
paddd mm5,mm4
|
||||
paddd mm7,mm1
|
||||
paddd mm5,mm1
|
||||
psrld mm7,SCALEBITS ; mm7=CrOL
|
||||
psrld mm5,SCALEBITS ; mm5=CrOH
|
||||
packssdw mm7,mm5 ; mm7=CrO
|
||||
|
||||
movq mm3, MMWORD [wk(0)] ; mm3=RE
|
||||
|
||||
movq mm4,mm6
|
||||
punpcklwd mm6,mm2
|
||||
punpckhwd mm4,mm2
|
||||
movq mm1,mm6
|
||||
movq mm5,mm4
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
|
||||
|
||||
paddd mm6, MMWORD [wk(6)]
|
||||
paddd mm4, MMWORD [wk(7)]
|
||||
paddd mm6,mm2
|
||||
paddd mm4,mm2
|
||||
psrld mm6,SCALEBITS ; mm6=YEL
|
||||
psrld mm4,SCALEBITS ; mm4=YEH
|
||||
packssdw mm6,mm4 ; mm6=YE
|
||||
|
||||
psllw mm0,BYTE_BIT
|
||||
por mm6,mm0 ; mm6=Y
|
||||
movq MMWORD [edi], mm6 ; Save Y
|
||||
|
||||
pxor mm2,mm2
|
||||
pxor mm4,mm4
|
||||
punpcklwd mm2,mm3 ; mm2=REL
|
||||
punpckhwd mm4,mm3 ; mm4=REH
|
||||
psrld mm2,1 ; mm2=REL*FIX(0.500)
|
||||
psrld mm4,1 ; mm4=REH*FIX(0.500)
|
||||
|
||||
movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm1,mm2
|
||||
paddd mm5,mm4
|
||||
paddd mm1,mm0
|
||||
paddd mm5,mm0
|
||||
psrld mm1,SCALEBITS ; mm1=CrEL
|
||||
psrld mm5,SCALEBITS ; mm5=CrEH
|
||||
packssdw mm1,mm5 ; mm1=CrE
|
||||
|
||||
psllw mm7,BYTE_BIT
|
||||
por mm1,mm7 ; mm1=Cr
|
||||
movq MMWORD [edx], mm1 ; Save Cr
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte SIZEOF_MMWORD ; outptr0
|
||||
add ebx, byte SIZEOF_MMWORD ; outptr1
|
||||
add edx, byte SIZEOF_MMWORD ; outptr2
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae near .columnloop
|
||||
test ecx,ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
pop ebx
|
||||
pop edx
|
||||
poppic eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
73
simd/jcolsamp.inc
Normal file
73
simd/jcolsamp.inc
Normal file
@@ -0,0 +1,73 @@
|
||||
;
|
||||
; jcolsamp.inc - private declarations for color conversion & up/downsampling
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; pseudo-registers to make ordering of RGB configurable
|
||||
; mmA/mmB, mmC/mmD, mmE/mmF and mmG/mmH are bound to the register pair
; (mm0/mm1, mm2/mm3 or mm4/mm5) of whichever of RGB_RED, RGB_GREEN, RGB_BLUE
; equals 0, 1, 2 and 3 respectively; a position that none of them matches
; is bound to mm6/mm7.
|
||||
%if RGB_RED == 0
|
||||
%define mmA mm0
|
||||
%define mmB mm1
|
||||
%elif RGB_GREEN == 0
|
||||
%define mmA mm2
|
||||
%define mmB mm3
|
||||
%elif RGB_BLUE == 0
|
||||
%define mmA mm4
|
||||
%define mmB mm5
|
||||
%else
|
||||
%define mmA mm6
|
||||
%define mmB mm7
|
||||
%endif
|
||||
|
||||
%if RGB_RED == 1
|
||||
%define mmC mm0
|
||||
%define mmD mm1
|
||||
%elif RGB_GREEN == 1
|
||||
%define mmC mm2
|
||||
%define mmD mm3
|
||||
%elif RGB_BLUE == 1
|
||||
%define mmC mm4
|
||||
%define mmD mm5
|
||||
%else
|
||||
%define mmC mm6
|
||||
%define mmD mm7
|
||||
%endif
|
||||
|
||||
%if RGB_RED == 2
|
||||
%define mmE mm0
|
||||
%define mmF mm1
|
||||
%elif RGB_GREEN == 2
|
||||
%define mmE mm2
|
||||
%define mmF mm3
|
||||
%elif RGB_BLUE == 2
|
||||
%define mmE mm4
|
||||
%define mmF mm5
|
||||
%else
|
||||
%define mmE mm6
|
||||
%define mmF mm7
|
||||
%endif
|
||||
|
||||
%if RGB_RED == 3
|
||||
%define mmG mm0
|
||||
%define mmH mm1
|
||||
%elif RGB_GREEN == 3
|
||||
%define mmG mm2
|
||||
%define mmH mm3
|
||||
%elif RGB_BLUE == 3
|
||||
%define mmG mm4
|
||||
%define mmH mm5
|
||||
%else
|
||||
%define mmG mm6
|
||||
%define mmH mm7
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
271
simd/jcqntmmx.asm
Normal file
271
simd/jcqntmmx.asm
Normal file
@@ -0,0 +1,271 @@
|
||||
;
|
||||
; jcqntmmx.asm - sample data conversion and quantization (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
; Runs DCTSIZE/4 iterations of .convloop, each handling four row pointers
; taken from sample_data.  From every row, the 8 bytes at column start_col
; are zero-extended to 16-bit words, 0xFF80 is added to each word (i.e. 128
; is subtracted), and the words are stored row by row into workspace.
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; DCTELEM * workspace);
|
||||
;
|
||||
|
||||
%define sample_data ebp+8 ; JSAMPARRAY sample_data
|
||||
%define start_col ebp+12 ; JDIMENSION start_col
|
||||
%define workspace ebp+16 ; DCTELEM * workspace
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_convsamp_mmx)
|
||||
|
||||
EXTN(jsimd_convsamp_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
pxor mm6,mm6 ; mm6=(all 0's)
|
||||
pcmpeqw mm7,mm7 ; mm7=(all 1's)
|
||||
psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
|
||||
|
||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [start_col]
|
||||
mov edi, POINTER [workspace] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4 ; loop count: four rows per iteration
|
||||
alignx 16,7
|
||||
.convloop:
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
|
||||
movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
|
||||
|
||||
mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
|
||||
movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
|
||||
|
||||
movq mm4,mm0
|
||||
punpcklbw mm0,mm6 ; mm0=(0123)
|
||||
punpckhbw mm4,mm6 ; mm4=(4567)
|
||||
movq mm5,mm1
|
||||
punpcklbw mm1,mm6 ; mm1=(89AB)
|
||||
punpckhbw mm5,mm6 ; mm5=(CDEF)
|
||||
|
||||
paddw mm0,mm7 ; add 0xFF80 to each word (== subtract 128)
|
||||
paddw mm4,mm7
|
||||
paddw mm1,mm7
|
||||
paddw mm5,mm7
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
|
||||
|
||||
movq mm0,mm2
|
||||
punpcklbw mm2,mm6 ; mm2=(GHIJ)
|
||||
punpckhbw mm0,mm6 ; mm0=(KLMN)
|
||||
movq mm4,mm3
|
||||
punpcklbw mm3,mm6 ; mm3=(OPQR)
|
||||
punpckhbw mm4,mm6 ; mm4=(STUV)
|
||||
|
||||
paddw mm2,mm7 ; add 0xFF80 to each word (== subtract 128)
|
||||
paddw mm0,mm7
|
||||
paddw mm3,mm7
|
||||
paddw mm4,mm7
|
||||
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
add esi, byte 4*SIZEOF_JSAMPROW ; next four row pointers
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; next four workspace rows
|
||||
dec ecx
|
||||
jnz short .convloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; This implementation is based on an algorithm described in
|
||||
; "How to optimize for the Pentium family of microprocessors"
|
||||
; (http://www.agner.org/assem/).
|
||||
;
; The divisors pointer is addressed as consecutive tables (see the
; RECIPROCAL/CORRECTION/SCALE/SHIFT macros below); this routine reads the
; RECIPROCAL, CORRECTION and SCALE tables only.  Per coefficient it takes
; the absolute value, adds the correction, multiplies by the reciprocal and
; then by the scale (keeping high words, with unsigned fix-ups), reapplies
; the original sign and stores the result to coef_block.
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
; DCTELEM * workspace);
|
||||
;
|
||||
|
||||
%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
|
||||
%define coef_block ebp+8 ; JCOEFPTR coef_block
|
||||
%define divisors ebp+12 ; DCTELEM * divisors
|
||||
%define workspace ebp+16 ; DCTELEM * workspace
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_quantize_mmx)
|
||||
|
||||
EXTN(jsimd_quantize_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov esi, POINTER [workspace]
|
||||
mov edx, POINTER [divisors]
|
||||
mov edi, JCOEFPTR [coef_block]
|
||||
mov ah, 2 ; outer loop count
|
||||
alignx 16,7
|
||||
.quantloop1:
|
||||
mov al, DCTSIZE2/8/2 ; inner loop count (8 coefficients per pass)
|
||||
alignx 16,7
|
||||
.quantloop2:
|
||||
movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
|
||||
|
||||
movq mm0,mm2
|
||||
movq mm1,mm3
|
||||
|
||||
psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise
|
||||
psraw mm3,(WORD_BIT-1)
|
||||
|
||||
pxor mm0,mm2 ; val = abs(val): (val ^ sign) - sign
|
||||
pxor mm1,mm3
|
||||
psubw mm0,mm2
|
||||
psubw mm1,mm3
|
||||
|
||||
;
|
||||
; MMX is an annoyingly crappy instruction set. It has two
|
||||
; misfeatures that are causing problems here:
|
||||
;
|
||||
; - All multiplications are signed.
|
||||
;
|
||||
; - The second operand for the shifts is not treated as packed.
|
||||
;
|
||||
;
|
||||
; We work around the first problem by implementing this algorithm:
|
||||
;
|
||||
; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
|
||||
; {
|
||||
; enum { SHORT_BIT = 16 };
|
||||
; signed short sx = (signed short) x;
|
||||
; signed short sy = (signed short) y;
|
||||
; signed long sz;
|
||||
;
|
||||
; sz = (long) sx * (long) sy; /* signed multiply */
|
||||
;
|
||||
; if (sx < 0) sz += (long) sy << SHORT_BIT;
|
||||
; if (sy < 0) sz += (long) sx << SHORT_BIT;
|
||||
;
|
||||
; return (unsigned long) sz;
|
||||
; }
|
||||
;
|
||||
; (note that a negative sx adds _sy_ and vice versa)
|
||||
;
|
||||
; For the second problem, we replace the shift by a multiplication.
|
||||
; Unfortunately that means we have to deal with the signed issue again.
|
||||
;
|
||||
|
||||
paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
|
||||
paddw mm1, MMWORD [CORRECTION(0,1,edx)]
|
||||
|
||||
movq mm4,mm0 ; store current value for later
|
||||
movq mm5,mm1
|
||||
pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
|
||||
pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
|
||||
paddw mm0,mm4 ; reciprocal is always negative (MSB=1),
|
||||
paddw mm1,mm5 ; so we always need to add the initial value
|
||||
; (input value is never negative as we
|
||||
; inverted it at the start of this routine)
|
||||
|
||||
; here it gets a bit tricky as both scale
|
||||
; and mm0/mm1 can be negative
|
||||
movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
|
||||
movq mm7, MMWORD [SCALE(0,1,edx)]
|
||||
movq mm4,mm0
|
||||
movq mm5,mm1
|
||||
pmulhw mm0,mm6
|
||||
pmulhw mm1,mm7
|
||||
|
||||
psraw mm6,(WORD_BIT-1) ; determine if scale is negative
|
||||
psraw mm7,(WORD_BIT-1)
|
||||
|
||||
pand mm6,mm4 ; and add input if it is
|
||||
pand mm7,mm5
|
||||
paddw mm0,mm6
|
||||
paddw mm1,mm7
|
||||
|
||||
psraw mm4,(WORD_BIT-1) ; then check if negative input
|
||||
psraw mm5,(WORD_BIT-1)
|
||||
|
||||
pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
|
||||
pand mm5, MMWORD [SCALE(0,1,edx)]
|
||||
paddw mm0,mm4
|
||||
paddw mm1,mm5
|
||||
|
||||
pxor mm0,mm2 ; reapply original sign: (val ^ sign) - sign
|
||||
pxor mm1,mm3
|
||||
psubw mm0,mm2
|
||||
psubw mm1,mm3
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
|
||||
|
||||
add esi, byte 8*SIZEOF_DCTELEM ; advance workspace by 8 elements
|
||||
add edx, byte 8*SIZEOF_DCTELEM ; advance divisor tables by 8 elements
|
||||
add edi, byte 8*SIZEOF_JCOEF ; advance output by 8 coefficients
|
||||
dec al
|
||||
jnz near .quantloop2
|
||||
dec ah
|
||||
jnz near .quantloop1 ; to avoid branch misprediction
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
321
simd/jcsammmx.asm
Normal file
321
simd/jcsammmx.asm
Normal file
@@ -0,0 +1,321 @@
|
||||
;
|
||||
; jcsammmx.asm - downsampling (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
|
||||
; without smoothing.
|
||||
;
; output_cols is width_blocks*8.  If 2*output_cols exceeds image_width, each
; of the max_v_samp_factor input rows is first padded on the right by
; replicating its last sample.  Then, for v_samp_factor rows, every output
; sample is (in[2k] + in[2k+1] + bias) >> 1, where bias alternates 0,1
; across output columns.
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
%define img_width(b) (b)+8 ; JDIMENSION image_width
|
||||
%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
|
||||
%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
|
||||
%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
|
||||
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
|
||||
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_downsample_mmx)
|
||||
|
||||
EXTN(jsimd_h2v1_downsample_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov ecx, JDIMENSION [width_blks(ebp)]
|
||||
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
|
||||
jz near .return ; output_cols == 0: nothing to do
|
||||
|
||||
mov edx, JDIMENSION [img_width(ebp)]
|
||||
|
||||
; -- expand_right_edge
|
||||
|
||||
push ecx
|
||||
shl ecx,1 ; output_cols * 2
|
||||
sub ecx,edx ; ecx = number of padding bytes per row
|
||||
jle short .expand_end ; no padding needed
|
||||
|
||||
mov eax, INT [max_v_samp(ebp)]
|
||||
test eax,eax
|
||||
jle short .expand_end
|
||||
|
||||
cld
|
||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
alignx 16,7
|
||||
.expandloop:
|
||||
push eax ; save row counter (al is used by stosb)
|
||||
push ecx ; save pad count (consumed by rep)
|
||||
|
||||
mov edi, JSAMPROW [esi]
|
||||
add edi,edx ; edi -> first byte past image_width
|
||||
mov al, JSAMPLE [edi-1] ; last sample of the row
|
||||
|
||||
rep stosb ; replicate it ecx times
|
||||
|
||||
pop ecx
|
||||
pop eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW
|
||||
dec eax
|
||||
jg short .expandloop
|
||||
|
||||
.expand_end:
|
||||
pop ecx ; output_cols
|
||||
|
||||
; -- h2v1_downsample
|
||||
|
||||
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
|
||||
test eax,eax
|
||||
jle short .return
|
||||
|
||||
mov edx, 0x00010000 ; bias pattern
|
||||
movd mm7,edx
|
||||
pcmpeqw mm6,mm6
|
||||
punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
|
||||
psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
push ecx
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mm2,mm0
|
||||
movq mm3,mm1
|
||||
|
||||
pand mm0,mm6 ; low byte of each word (even samples)
|
||||
psrlw mm2,BYTE_BIT ; high byte of each word (odd samples)
|
||||
pand mm1,mm6
|
||||
psrlw mm3,BYTE_BIT
|
||||
|
||||
paddw mm0,mm2 ; even + odd
|
||||
paddw mm1,mm3
|
||||
paddw mm0,mm7 ; + alternating bias
|
||||
paddw mm1,mm7
|
||||
psrlw mm0,1 ; / 2
|
||||
psrlw mm1,1
|
||||
|
||||
packuswb mm0,mm1 ; 8 output samples
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
|
||||
|
||||
add esi, byte 2*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte 1*SIZEOF_MMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_MMWORD ; outcol
|
||||
jnz short .columnloop
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
pop ecx
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec eax ; rowctr
|
||||
jg short .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
|
||||
; without smoothing.
|
||||
;
; output_cols is width_blocks*8.  If 2*output_cols exceeds image_width, each
; of the max_v_samp_factor input rows is first padded on the right by
; replicating its last sample.  Then, for v_samp_factor output rows, two
; input rows are consumed per output row and every output sample is
; (in0[2k] + in0[2k+1] + in1[2k] + in1[2k+1] + bias) >> 2, where bias
; alternates 1,2 across output columns.
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
%define img_width(b) (b)+8 ; JDIMENSION image_width
|
||||
%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
|
||||
%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
|
||||
%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
|
||||
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
|
||||
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_downsample_mmx)
|
||||
|
||||
EXTN(jsimd_h2v2_downsample_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov ecx, JDIMENSION [width_blks(ebp)]
|
||||
shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
|
||||
jz near .return ; output_cols == 0: nothing to do
|
||||
|
||||
mov edx, JDIMENSION [img_width(ebp)]
|
||||
|
||||
; -- expand_right_edge
|
||||
|
||||
push ecx
|
||||
shl ecx,1 ; output_cols * 2
|
||||
sub ecx,edx ; ecx = number of padding bytes per row
|
||||
jle short .expand_end ; no padding needed
|
||||
|
||||
mov eax, INT [max_v_samp(ebp)]
|
||||
test eax,eax
|
||||
jle short .expand_end
|
||||
|
||||
cld
|
||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
alignx 16,7
|
||||
.expandloop:
|
||||
push eax ; save row counter (al is used by stosb)
|
||||
push ecx ; save pad count (consumed by rep)
|
||||
|
||||
mov edi, JSAMPROW [esi]
|
||||
add edi,edx ; edi -> first byte past image_width
|
||||
mov al, JSAMPLE [edi-1] ; last sample of the row
|
||||
|
||||
rep stosb ; replicate it ecx times
|
||||
|
||||
pop ecx
|
||||
pop eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW
|
||||
dec eax
|
||||
jg short .expandloop
|
||||
|
||||
.expand_end:
|
||||
pop ecx ; output_cols
|
||||
|
||||
; -- h2v2_downsample
|
||||
|
||||
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
|
||||
test eax,eax
|
||||
jle near .return
|
||||
|
||||
mov edx, 0x00020001 ; bias pattern
|
||||
movd mm7,edx
|
||||
pcmpeqw mm6,mm6
|
||||
punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
|
||||
psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
push ecx
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
|
||||
movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
|
||||
movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
|
||||
movq mm4,mm0
|
||||
movq mm5,mm1
|
||||
pand mm0,mm6 ; low byte of each word (even samples)
|
||||
psrlw mm4,BYTE_BIT ; high byte of each word (odd samples)
|
||||
pand mm1,mm6
|
||||
psrlw mm5,BYTE_BIT
|
||||
paddw mm0,mm4 ; row 0: even + odd
|
||||
paddw mm1,mm5 ; row 1: even + odd
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm5,mm3
|
||||
pand mm2,mm6
|
||||
psrlw mm4,BYTE_BIT
|
||||
pand mm3,mm6
|
||||
psrlw mm5,BYTE_BIT
|
||||
paddw mm2,mm4
|
||||
paddw mm3,mm5
|
||||
|
||||
paddw mm0,mm1 ; row 0 + row 1
|
||||
paddw mm2,mm3
|
||||
paddw mm0,mm7 ; + alternating bias
|
||||
paddw mm2,mm7
|
||||
psrlw mm0,2 ; / 4
|
||||
psrlw mm2,2
|
||||
|
||||
packuswb mm0,mm2 ; 8 output samples
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
|
||||
|
||||
add edx, byte 2*SIZEOF_MMWORD ; inptr0
|
||||
add esi, byte 2*SIZEOF_MMWORD ; inptr1
|
||||
add edi, byte 1*SIZEOF_MMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_MMWORD ; outcol
|
||||
jnz near .columnloop
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
pop ecx
|
||||
|
||||
add esi, byte 2*SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte 1*SIZEOF_JSAMPROW ; output_data
|
||||
dec eax ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
433
simd/jdcolmmx.asm
Normal file
433
simd/jdcolmmx.asm
Normal file
@@ -0,0 +1,433 @@
|
||||
;
|
||||
; jdcolmmx.asm - colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point constants for YCbCr->RGB conversion: FIX(x) is x scaled by
; 2^SCALEBITS and rounded to an integer.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_ycc_rgb_convert_mmx)
|
||||
|
||||
; Packed constant table; each entry below is one 8-byte MMX operand.
EXTN(jconst_ycc_rgb_convert_mmx):
|
||||
|
||||
PW_F0402 times 4 dw F_0_402 ; 4 words: FIX(0.40200)
|
||||
PW_MF0228 times 4 dw -F_0_228 ; 4 words: -FIX(0.22800)
|
||||
PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 ; word pairs for pmaddwd
|
||||
PW_ONE times 4 dw 1 ; 4 words: 1
|
||||
PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) ; 2 dwords: rounding term
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
; For each of num_rows rows, reads the three component rows
; input_buf[0..2][input_row + row] and writes out_width interleaved pixels
; of RGB_PIXELSIZE bytes to output_buf[row].  The column loop converts
; 8 pixels per pass; when fewer than 8 remain, the converted bytes are
; stored piecewise (.column_st16 and the labels after it).
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
|
||||
; JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
; JSAMPARRAY output_buf, int num_rows)
|
||||
;
|
||||
|
||||
%define out_width(b) (b)+8 ; JDIMENSION out_width
|
||||
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
|
||||
%define input_row(b) (b)+16 ; JDIMENSION input_row
|
||||
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
|
||||
%define num_rows(b) (b)+24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_ycc_rgb_convert_mmx)
|
||||
|
||||
EXTN(jsimd_ycc_rgb_convert_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
|
||||
test ecx,ecx
|
||||
jz near .return
|
||||
|
||||
push ecx ; save num_cols while ecx holds input_row
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)]
|
||||
mov ecx, JDIMENSION [input_row(eax)]
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
|
||||
lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; &input_buf[0][input_row]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; &input_buf[1][input_row]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; &input_buf[2][input_row]
|
||||
|
||||
pop ecx ; num_cols
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax,eax
|
||||
jle near .return
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
push eax
|
||||
push edi
|
||||
push edx
|
||||
push ebx
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx] ; inptr1
|
||||
mov edx, JSAMPROW [edx] ; inptr2
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
|
||||
movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
|
||||
|
||||
pcmpeqw mm4,mm4
|
||||
pcmpeqw mm7,mm7
|
||||
psrlw mm4,BYTE_BIT
|
||||
psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
|
||||
movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
pand mm4,mm5 ; mm4=Cb(0246)=CbE
|
||||
psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
|
||||
pand mm0,mm1 ; mm0=Cr(0246)=CrE
|
||||
psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
|
||||
|
||||
paddw mm4,mm7 ; add 0xFF80 to each word (== subtract 128)
|
||||
paddw mm5,mm7
|
||||
paddw mm0,mm7
|
||||
paddw mm1,mm7
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
movq mm2,mm4 ; mm2=CbE
|
||||
movq mm3,mm5 ; mm3=CbO
|
||||
paddw mm4,mm4 ; mm4=2*CbE
|
||||
paddw mm5,mm5 ; mm5=2*CbO
|
||||
movq mm6,mm0 ; mm6=CrE
|
||||
movq mm7,mm1 ; mm7=CrO
|
||||
paddw mm0,mm0 ; mm0=2*CrE
|
||||
paddw mm1,mm1 ; mm1=2*CrO
|
||||
|
||||
pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
|
||||
pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
|
||||
pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
|
||||
pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
|
||||
|
||||
paddw mm4,[GOTOFF(eax,PW_ONE)] ; +1 so the following >>1 rounds
|
||||
paddw mm5,[GOTOFF(eax,PW_ONE)]
|
||||
psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
|
||||
psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
|
||||
paddw mm0,[GOTOFF(eax,PW_ONE)]
|
||||
paddw mm1,[GOTOFF(eax,PW_ONE)]
|
||||
psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
|
||||
psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
|
||||
|
||||
paddw mm4,mm2
|
||||
paddw mm5,mm3
|
||||
paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||
paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||
paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||
paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
|
||||
movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm5,mm3
|
||||
punpcklwd mm2,mm6 ; interleave Cb,Cr words for pmaddwd
|
||||
punpckhwd mm4,mm6
|
||||
pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
punpcklwd mm3,mm7
|
||||
punpckhwd mm5,mm7
|
||||
pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
paddd mm2,[GOTOFF(eax,PD_ONEHALF)] ; round before descaling
|
||||
paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad mm2,SCALEBITS
|
||||
psrad mm4,SCALEBITS
|
||||
paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad mm3,SCALEBITS
|
||||
psrad mm5,SCALEBITS
|
||||
|
||||
packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||
packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||
psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||
psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||
|
||||
movq mm5, MMWORD [esi] ; mm5=Y(01234567)
|
||||
|
||||
pcmpeqw mm4,mm4
|
||||
psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
pand mm4,mm5 ; mm4=Y(0246)=YE
|
||||
psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
|
||||
|
||||
paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
|
||||
paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
|
||||
packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
|
||||
packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
|
||||
|
||||
paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
|
||||
paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
|
||||
packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
|
||||
packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
|
||||
|
||||
paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
|
||||
paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
|
||||
packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
|
||||
packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
|
||||
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
|
||||
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
|
||||
; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
|
||||
|
||||
punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
|
||||
punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
|
||||
punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
|
||||
|
||||
movq mmG,mmA
|
||||
movq mmH,mmA
|
||||
punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
|
||||
punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
|
||||
|
||||
psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
|
||||
psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
|
||||
|
||||
movq mmC,mmD
|
||||
movq mmB,mmD
|
||||
punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
|
||||
punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
|
||||
|
||||
psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
|
||||
|
||||
movq mmF,mmE
|
||||
punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
|
||||
punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
|
||||
|
||||
punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
|
||||
punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
|
||||
punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st16 ; fewer than 8 pixels left: partial store
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add esi, byte SIZEOF_MMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_MMWORD ; inptr1
|
||||
add edx, byte SIZEOF_MMWORD ; inptr2
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
|
||||
jmp near .columnloop
|
||||
alignx 16,7
|
||||
|
||||
.column_st16:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_MMWORD
|
||||
jb short .column_st8
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
|
||||
movq mmA,mmC ; remaining bytes come from the third qword
|
||||
sub ecx, byte 2*SIZEOF_MMWORD
|
||||
add edi, byte 2*SIZEOF_MMWORD
|
||||
jmp short .column_st4
|
||||
.column_st8:
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st4
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq mmA,mmE ; remaining bytes come from the second qword
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
.column_st4:
|
||||
movd eax,mmA ; eax = next 4 bytes to store (GOT pointer in eax is no longer needed for this row)
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st2
|
||||
mov DWORD [edi+0*SIZEOF_DWORD], eax
|
||||
psrlq mmA,DWORD_BIT
|
||||
movd eax,mmA
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
add edi, byte SIZEOF_DWORD
|
||||
.column_st2:
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi+0*SIZEOF_WORD], ax
|
||||
shr eax,WORD_BIT
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
add edi, byte SIZEOF_WORD
|
||||
.column_st1:
|
||||
cmp ecx, byte SIZEOF_BYTE
|
||||
jb short .nextrow
|
||||
mov BYTE [edi+0*SIZEOF_BYTE], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
|
||||
pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
|
||||
%else
|
||||
pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
|
||||
pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
|
||||
%endif
|
||||
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
|
||||
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
|
||||
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
|
||||
; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
|
||||
|
||||
punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
|
||||
punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
|
||||
punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
|
||||
punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
|
||||
|
||||
movq mmC,mmA
|
||||
punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
|
||||
punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
|
||||
movq mmG,mmB
|
||||
punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
|
||||
punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
|
||||
|
||||
movq mmD,mmA
|
||||
punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
|
||||
movq mmH,mmC
|
||||
punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st16 ; fewer than 8 pixels left: partial store
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add esi, byte SIZEOF_MMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_MMWORD ; inptr1
|
||||
add edx, byte SIZEOF_MMWORD ; inptr2
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
|
||||
jmp near .columnloop
|
||||
alignx 16,7
|
||||
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_MMWORD/2 ; at least 4 pixels left?
|
||||
jb short .column_st8
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
|
||||
movq mmA,mmC
|
||||
movq mmD,mmH
|
||||
sub ecx, byte SIZEOF_MMWORD/2
|
||||
add edi, byte 2*SIZEOF_MMWORD
|
||||
.column_st8:
|
||||
cmp ecx, byte SIZEOF_MMWORD/4 ; at least 2 pixels left?
|
||||
jb short .column_st4
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq mmA,mmD
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
add edi, byte 1*SIZEOF_MMWORD
|
||||
.column_st4:
|
||||
cmp ecx, byte SIZEOF_MMWORD/8 ; one pixel left?
|
||||
jb short .nextrow
|
||||
movd DWORD [edi+0*SIZEOF_DWORD], mmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
alignx 16,7
|
||||
|
||||
.nextrow:
|
||||
pop ecx
|
||||
pop esi
|
||||
pop ebx
|
||||
pop edx
|
||||
pop edi
|
||||
pop eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_buf
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
27
simd/jdct.inc
Normal file
27
simd/jdct.inc
Normal file
@@ -0,0 +1,27 @@
|
||||
;
|
||||
; jdct.inc - private declarations for forward & reverse DCT subsystems
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
; Each IDCT routine is responsible for range-limiting its results and
|
||||
; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
|
||||
; be quite far out of range if the input data is corrupt, so a bulletproof
|
||||
; range-limiting step is required. We use a mask-and-table-lookup method
|
||||
; to do the combined operations quickly.
|
||||
;
|
||||
%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
|
||||
|
||||
; Address-expression helpers. In all of them b is the base (register or
; address) and s is the size in bytes of one element.
;   ROW(n,b,s) = b + n*s           (step n elements from b)
;   COL(n,b,s) = b + n*s*DCTSIZE   (step n groups of DCTSIZE elements from b)
%define ROW(n,b,s) ((b)+(n)*(s))
|
||||
%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
|
||||
|
||||
; DWBLOCK / MMBLOCK(m,n,b,s): skip m groups of DCTSIZE elements of size s
; from b, then n dwords (DWBLOCK) or n mmwords (MMBLOCK) within that group.
%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
|
||||
%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
492
simd/jdmermmx.asm
Normal file
492
simd/jdmermmx.asm
Normal file
@@ -0,0 +1,492 @@
|
||||
;
|
||||
; jdmermmx.asm - merged upsampling/color conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point scale: FIX(x) below is x * 2^SCALEBITS rounded to an integer
; (e.g. 0.34414 * 65536 = 22553.6 -> 22554).
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
; Reduced coefficients: the three full-size factors above do not fit in a
; signed 16-bit word, so the table stores these differences instead
; (26345, 18734 and 14942 respectively) and the code adds or subtracts the
; whole multiples of Cb/Cr separately.
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_merged_upsample_mmx)
|
||||
|
||||
; Constant table used by the merged upsample/colour-conversion code.
; Each entry is one mmword (8 bytes).
EXTN(jconst_merged_upsample_mmx):
|
||||
|
||||
; 4 x word: multiplier for the Cr term of R (used with pmulhw)
PW_F0402 times 4 dw F_0_402
|
||||
; 4 x word: multiplier for the Cb term of B (used with pmulhw)
PW_MF0228 times 4 dw -F_0_228
|
||||
; 2 x (word,word): (Cb,Cr) multiplier pairs for G (used with pmaddwd)
PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
|
||||
; 4 x word: rounding bias added before the final ">> 1" of the pmulhw results
PW_ONE times 4 dw 1
|
||||
; 2 x dword: 0.5 in SCALEBITS fixed point, rounding bias before ">> SCALEBITS"
PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; Stack arguments, addressed relative to a base register b that points at
; the saved ebp (the prologue below sets eax = esp right after "push ebp").
%define output_width(b) (b)+8 ; JDIMENSION output_width
|
||||
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
|
||||
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
|
||||
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
|
||||
|
||||
; Local frame, addressed relative to the aligned ebp:
;   [ebp+0]      pointer to the saved ebp (pre-alignment stack position)
;   wk(0..2)     WK_NUM mmword scratch slots directly below ebp
;   gotptr       pointer-sized slot directly below wk(0)
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_merged_upsample_mmx)
|
||||
|
||||
; Entry point. This part sets up the aligned frame, saves the callee-saved
; registers, and resolves the three input row pointers and the output row
; pointer. It returns immediately (via .return) when output_width is 0.
EXTN(jsimd_h2v1_merged_upsample_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
; store the unaligned frame pointer at [ebp+0] so the epilogue can "pop esp"
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
; reserve the wk[] scratch area
lea esp, [wk(0)]
|
||||
; pushpic/get_GOT/movpic are macros defined outside this chunk (presumably
; in jsimdext.inc); the pushed value itself is overwritten by movpic below.
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [output_width(eax)] ; col
|
||||
test ecx,ecx
|
||||
; nothing to do for a zero-width row (no MMX state touched yet, so no emms)
jz near .return
|
||||
|
||||
; park col on the stack while ecx is borrowed for in_row_group_ctr
push ecx
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)]
|
||||
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
|
||||
; esi/ebx/edx = input_buf[0], input_buf[1], input_buf[2] (row-pointer arrays)
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||
; select row in_row_group_ctr of each component; the column loop reads
; inptr0 as Y, inptr1 as Cb and inptr2 as Cr
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
|
||||
; only the first output row (output_buf[0]) is written
mov edi, JSAMPROW [edi] ; outptr
|
||||
|
||||
pop ecx ; col
|
||||
|
||||
alignx 16,7
|
||||
; Outer column loop: one iteration per 8 Cb/Cr samples, which cover up to
; 16 output pixels. It computes the chroma-only terms (R-Y), (G-Y), (B-Y)
; as signed words. The low four of each stay in mm0/mm2/mm4 for the first
; Y pass; the high four are parked in wk(1)/wk(2)/wk(0) for the second.
; NOTE(review): a full mmword of Cb and Cr is read regardless of how many
; columns remain -- presumably the input rows are padded; confirm.
.columnloop:
|
||||
; eax is reused as the Y pass counter (al) further down, so the GOT address
; has to be reloaded on every iteration.
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
|
||||
movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
|
||||
|
||||
pxor mm1,mm1 ; mm1=(all 0's)
|
||||
pcmpeqw mm3,mm3
|
||||
psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
|
||||
|
||||
; zero-extend the chroma bytes to words
movq mm4,mm6
|
||||
punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH
|
||||
punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL
|
||||
movq mm0,mm7
|
||||
punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH
|
||||
punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL
|
||||
|
||||
; add 0xFF80 (= -128 as a signed word): recentre Cb/Cr around zero
paddw mm6,mm3
|
||||
paddw mm4,mm3
|
||||
paddw mm7,mm3
|
||||
paddw mm0,mm3
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
; keep unscaled copies (mm5/mm2 = Cb, mm1/mm3 = Cr) and double the
; multiplicands so that one extra bit survives pmulhw for rounding
movq mm5,mm6 ; mm5=CbH
|
||||
movq mm2,mm4 ; mm2=CbL
|
||||
paddw mm6,mm6 ; mm6=2*CbH
|
||||
paddw mm4,mm4 ; mm4=2*CbL
|
||||
movq mm1,mm7 ; mm1=CrH
|
||||
movq mm3,mm0 ; mm3=CrL
|
||||
paddw mm7,mm7 ; mm7=2*CrH
|
||||
paddw mm0,mm0 ; mm0=2*CrL
|
||||
|
||||
; pmulhw keeps the high 16 bits of each signed product, i.e. (2*C*K) >> 16
pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
|
||||
pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
|
||||
pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
|
||||
pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
|
||||
|
||||
; +1 then arithmetic >> 1: drop the extra bit with rounding
paddw mm6,[GOTOFF(eax,PW_ONE)]
|
||||
paddw mm4,[GOTOFF(eax,PW_ONE)]
|
||||
psraw mm6,1 ; mm6=(CbH * -FIX(0.22800))
|
||||
psraw mm4,1 ; mm4=(CbL * -FIX(0.22800))
|
||||
paddw mm7,[GOTOFF(eax,PW_ONE)]
|
||||
paddw mm0,[GOTOFF(eax,PW_ONE)]
|
||||
psraw mm7,1 ; mm7=(CrH * FIX(0.40200))
|
||||
psraw mm0,1 ; mm0=(CrL * FIX(0.40200))
|
||||
|
||||
; add back the whole multiples: + Cb + Cb for B, + Cr for R
paddw mm6,mm5
|
||||
paddw mm4,mm2
|
||||
paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
|
||||
paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
|
||||
paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
|
||||
paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
|
||||
|
||||
; spill the high-half results; mm6/mm7 are needed as temporaries for G
movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
|
||||
movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
|
||||
|
||||
; G term: interleave (Cb,Cr) word pairs and let pmaddwd compute
; Cb*-FIX(0.344) + Cr*FIX(0.285) as one dword per sample
movq mm6,mm5
|
||||
movq mm7,mm2
|
||||
punpcklwd mm5,mm1
|
||||
punpckhwd mm6,mm1
|
||||
pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
punpcklwd mm2,mm3
|
||||
punpckhwd mm7,mm3
|
||||
pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
; round (add one half) and descale by SCALEBITS
paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd mm6,[GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad mm5,SCALEBITS
|
||||
psrad mm6,SCALEBITS
|
||||
paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd mm7,[GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad mm2,SCALEBITS
|
||||
psrad mm7,SCALEBITS
|
||||
|
||||
; back to words, then subtract the whole Cr (0.285 - 1 = -0.714)
packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
||||
packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
||||
psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
||||
psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
||||
|
||||
movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
|
||||
|
||||
; two Y passes of 8 pixels each per chroma block; this overwrites the low
; byte of the GOT address held in eax
mov al,2 ; Yctr
|
||||
; the first pass already has the low-half terms in mm0/mm2/mm4
jmp short .Yloop_1st
|
||||
alignx 16,7
|
||||
|
||||
; Second Y pass: reload the high-half chroma terms spilled to wk[] by the
; column loop, then fall through into the common pass body.
.Yloop_2nd:
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
|
||||
movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
|
||||
movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
|
||||
alignx 16,7
|
||||
|
||||
; Common pass body: combine 8 Y samples with 4 chroma terms. Chroma term k
; is added to both Y(2k) (even lane) and Y(2k+1) (odd lane), which is the
; 2:1 horizontal upsampling. On entry mm0/mm2/mm4 = (R-Y)/(G-Y)/(B-Y).
.Yloop_1st:
|
||||
movq mm7, MMWORD [esi] ; mm7=Y(01234567)
|
||||
|
||||
; split Y into even and odd samples, each zero-extended to a word
pcmpeqw mm6,mm6
|
||||
psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
pand mm6,mm7 ; mm6=Y(0246)=YE
|
||||
psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO
|
||||
|
||||
; duplicate the chroma terms: one copy for the even lane, one for the odd
movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H)
|
||||
movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H)
|
||||
movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H)
|
||||
|
||||
; packuswb saturates each signed word to 0..255, which is the range limiting
paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
|
||||
paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
|
||||
packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
|
||||
packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
|
||||
|
||||
paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
|
||||
paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
|
||||
packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
|
||||
packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
|
||||
|
||||
paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
|
||||
paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
|
||||
packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
|
||||
packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
|
||||
|
||||
; 3-bytes-per-pixel output: interleave the six even/odd component vectors
; into 24 consecutive bytes (mmA, mmE, mmC) and store them.
; mmA..mmH are register aliases defined outside this chunk (presumably in
; jcolsamp.inc, according to the configured component order).
; In the layout comments "cp" means component c of pixel p.
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
|
||||
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
|
||||
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
|
||||
; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
|
||||
|
||||
punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
|
||||
punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
|
||||
punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
|
||||
|
||||
movq mmG,mmA
|
||||
movq mmH,mmA
|
||||
punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
|
||||
punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
|
||||
|
||||
psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
|
||||
psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
|
||||
|
||||
movq mmC,mmD
|
||||
movq mmB,mmD
|
||||
punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
|
||||
punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
|
||||
|
||||
psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
|
||||
|
||||
movq mmF,mmE
|
||||
punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
|
||||
punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
|
||||
|
||||
punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
|
||||
punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
|
||||
punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
|
||||
|
||||
; fewer than 8 pixels left in the row -> partial store
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st16
|
||||
|
||||
; full group: 8 pixels = 3 mmwords
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
jz short .endcolumn
|
||||
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
|
||||
add esi, byte SIZEOF_MMWORD ; inptr0
|
||||
dec al ; Yctr
|
||||
; second half of the current chroma block still pending
jnz near .Yloop_2nd
|
||||
|
||||
; both Y passes done: advance to the next 8 chroma samples
add ebx, byte SIZEOF_MMWORD ; inptr1
|
||||
add edx, byte SIZEOF_MMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16,7
|
||||
|
||||
; Partial store for the last 1..7 pixels. ecx is converted from pixels to
; bytes (x3, so at most 21), then written in chunks of 16, 8, 4, 2 and 1
; bytes, shifting the not-yet-written bytes down into mmA / eax each time.
.column_st16:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_MMWORD
|
||||
jb short .column_st8
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
|
||||
movq mmA,mmC
|
||||
sub ecx, byte 2*SIZEOF_MMWORD
|
||||
add edi, byte 2*SIZEOF_MMWORD
|
||||
; fewer than 8 bytes can remain after 16 were written, so skip .column_st8
jmp short .column_st4
|
||||
.column_st8:
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st4
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq mmA,mmE
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
.column_st4:
|
||||
; eax (Yctr / GOT) is no longer needed: this is the end of the row
movd eax,mmA
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st2
|
||||
mov DWORD [edi+0*SIZEOF_DWORD], eax
|
||||
psrlq mmA,DWORD_BIT
|
||||
movd eax,mmA
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
add edi, byte SIZEOF_DWORD
|
||||
.column_st2:
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi+0*SIZEOF_WORD], ax
|
||||
shr eax,WORD_BIT
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
add edi, byte SIZEOF_WORD
|
||||
.column_st1:
|
||||
cmp ecx, byte SIZEOF_BYTE
|
||||
jb short .endcolumn
|
||||
mov BYTE [edi+0*SIZEOF_BYTE], al
|
||||
|
||||
; 4-bytes-per-pixel output: a filler byte is generated for every pixel,
; then the eight even/odd component vectors are interleaved into 32
; consecutive bytes (mmA, mmD, mmC, mmH) and stored.
; mmA..mmH are register aliases defined outside this chunk (presumably in
; jcolsamp.inc). In the layout comments "cp" means component c of pixel p.
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
; filler component: all ones (0xFF) or all zeros, chosen at assembly time
%ifdef RGBX_FILLER_0XFF
|
||||
pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
|
||||
pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
|
||||
%else
|
||||
pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
|
||||
pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
|
||||
%endif
|
||||
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
|
||||
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
|
||||
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
|
||||
; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
|
||||
|
||||
punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
|
||||
punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
|
||||
punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
|
||||
punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
|
||||
|
||||
movq mmC,mmA
|
||||
punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
|
||||
punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
|
||||
movq mmG,mmB
|
||||
punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
|
||||
punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
|
||||
|
||||
movq mmD,mmA
|
||||
punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
|
||||
movq mmH,mmC
|
||||
punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
|
||||
|
||||
; fewer than 8 pixels left in the row -> partial store
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st16
|
||||
|
||||
; full group: 8 pixels = 4 mmwords
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
jz short .endcolumn
|
||||
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
|
||||
add esi, byte SIZEOF_MMWORD ; inptr0
|
||||
dec al ; Yctr
|
||||
; second half of the current chroma block still pending
jnz near .Yloop_2nd
|
||||
|
||||
; both Y passes done: advance to the next 8 chroma samples
add ebx, byte SIZEOF_MMWORD ; inptr1
|
||||
add edx, byte SIZEOF_MMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16,7
|
||||
|
||||
; Partial store for the last 1..7 pixels. ecx still counts pixels here;
; with 4 bytes per pixel the thresholds SIZEOF_MMWORD/2, /4 and /8 are the
; pixel counts that fill two mmwords, one mmword and one dword respectively.
.column_st16:
|
||||
cmp ecx, byte SIZEOF_MMWORD/2
|
||||
jb short .column_st8
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
|
||||
; shift the unwritten pixels down into mmA/mmD for the next steps
movq mmA,mmC
|
||||
movq mmD,mmH
|
||||
sub ecx, byte SIZEOF_MMWORD/2
|
||||
add edi, byte 2*SIZEOF_MMWORD
|
||||
.column_st8:
|
||||
cmp ecx, byte SIZEOF_MMWORD/4
|
||||
jb short .column_st4
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq mmA,mmD
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
add edi, byte 1*SIZEOF_MMWORD
|
||||
.column_st4:
|
||||
cmp ecx, byte SIZEOF_MMWORD/8
|
||||
jb short .endcolumn
|
||||
; last single pixel: low dword of mmA
movd DWORD [edi+0*SIZEOF_DWORD], mmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; Row finished: leave MMX mode so the FPU register file is usable again.
.endcolumn:
|
||||
emms ; empty MMX state
|
||||
|
||||
; Common exit. Also reached directly when output_width is 0, in which case
; no MMX instruction has run and emms is skipped.
.return:
|
||||
; restore callee-saved registers in reverse push order
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
; drop the GOT slot and wk[] in one step, then reload the pre-alignment
; stack pointer that the prologue stored at [ebp+0]
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; Stack arguments relative to ebp (standard frame: [ebp+0] = saved ebp).
%define output_width(b) (b)+8 ; JDIMENSION output_width
|
||||
%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
|
||||
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
|
||||
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_merged_upsample_mmx)
|
||||
|
||||
; The 2:1 vertical case is done as two calls to the h2v1 routine, one per
; output row, sharing the same Cb/Cr rows. A temporary 3-entry input_buf
; array and the callee's argument list are built on the stack and patched
; between the two calls.
EXTN(jsimd_h2v2_merged_upsample_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov eax, JDIMENSION [output_width(ebp)]
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(ebp)]
|
||||
mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
|
||||
mov edi, JSAMPARRAY [output_buf(ebp)]
|
||||
; Pre-offset the first component's row array by in_row_group_ctr rows. The
; callee indexes it by in_row_group_ctr again, so it ends up reading row
; 2*in_row_group_ctr of component 0 (and +1 on the second call).
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
; temporary input_buf[3] on the stack (pushed in reverse so [ebx+0] is entry 0)
push edx ; inptr2
|
||||
push ebx ; inptr1
|
||||
push esi ; inptr00
|
||||
mov ebx,esp
|
||||
|
||||
; callee arguments, last to first
push edi ; output_buf (outptr0)
|
||||
push ecx ; in_row_group_ctr
|
||||
push ebx ; input_buf
|
||||
push eax ; output_width
|
||||
|
||||
call near EXTN(jsimd_h2v1_merged_upsample_mmx)
|
||||
|
||||
; ebx, esi and edi survive the call (the callee saves and restores them).
; Step to the next row of component 0 and the next output row, and patch
; them into the temporary array / argument list in place.
add esi, byte SIZEOF_JSAMPROW ; inptr01
|
||||
add edi, byte SIZEOF_JSAMPROW ; outptr1
|
||||
mov POINTER [ebx+0*SIZEOF_POINTER], esi
|
||||
mov POINTER [ebx-1*SIZEOF_POINTER], edi
|
||||
|
||||
call near EXTN(jsimd_h2v1_merged_upsample_mmx)
|
||||
|
||||
; discard the 4 arguments and the 3-entry temporary array
add esp, byte 7*SIZEOF_DWORD
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
734
simd/jdsammmx.asm
Normal file
734
simd/jdsammmx.asm
Normal file
@@ -0,0 +1,734 @@
|
||||
;
|
||||
; jdsammmx.asm - upsampling (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_fancy_upsample_mmx)
|
||||
|
||||
; Word constants for the fancy (triangle-filter) upsamplers, one mmword
; (4 words) each.
EXTN(jconst_fancy_upsample_mmx):
|
||||
|
||||
; PW_ONE / PW_TWO: biases added before the ">> 2" in the h2v1 routine
PW_ONE times 4 dw 1
|
||||
PW_TWO times 4 dw 2
|
||||
; PW_THREE: weight applied to the nearer sample
PW_THREE times 4 dw 3
|
||||
; PW_SEVEN / PW_EIGHT: biases added before the ">> 4" in the h2v2 routine
PW_SEVEN times 4 dw 7
|
||||
PW_EIGHT times 4 dw 8
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; The upsampling algorithm is linear interpolation between pixel centers,
|
||||
; also known as a "triangle filter". This is a good compromise between
|
||||
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
|
||||
; of the way between input pixel centers.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
; Stack arguments relative to ebp (standard frame: [ebp+0] = saved ebp).
%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
|
||||
%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
|
||||
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_fancy_upsample_mmx)
|
||||
|
||||
; Entry point. This part saves registers, loads the arguments and returns
; early when either the width or the row count is zero.
; Register use for the rest of the routine: eax = colctr, ecx = rowctr,
; esi/edi = input/output row-pointer arrays, ebx = GOT base for GOTOFF().
EXTN(jsimd_h2v1_fancy_upsample_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
; pushpic/get_GOT are macros defined outside this chunk (presumably in
; jsimdext.inc); paired with "poppic ebx" in the epilogue.
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
|
||||
test eax,eax
|
||||
jz near .return
|
||||
|
||||
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||
test ecx,ecx
|
||||
jz near .return
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
; output_data_ptr points at the JSAMPARRAY, so dereference once
mov edi, POINTER [output_data_ptr(ebp)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16,7
|
||||
; Per-row setup: save the loop state, fetch the row pointers, pad the input
; row if needed and seed the "previous sample" register mm7.
.rowloop:
|
||||
push eax ; colctr
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
|
||||
; Width not a multiple of SIZEOF_MMWORD: copy the last real sample one
; position further so it has a right-hand neighbour equal to itself.
; NOTE(review): this writes one byte past the last input sample --
; presumably the row buffers are padded; confirm with the caller.
test eax, SIZEOF_MMWORD-1
|
||||
jz short .skip
|
||||
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
.skip:
|
||||
pxor mm0,mm0 ; mm0=(all 0's)
|
||||
; mm7 = first sample of the row in the low byte, rest zero: the left
; neighbour of sample 0 is sample 0 itself
pcmpeqb mm7,mm7
|
||||
psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
|
||||
pand mm7, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
|
||||
; round colctr up to a whole number of mmwords
add eax, byte SIZEOF_MMWORD-1
|
||||
and eax, byte -SIZEOF_MMWORD
|
||||
cmp eax, byte SIZEOF_MMWORD
|
||||
; more than one block: start with the general case; otherwise fall through
; into the last-block case
ja short .columnloop
|
||||
alignx 16,7
|
||||
|
||||
; Last block of the row: there is no following block, so the right-hand
; neighbour of sample 7 is sample 7 itself (kept in the top byte of mm6).
.columnloop_last:
|
||||
pcmpeqb mm6,mm6
|
||||
psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
|
||||
pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
jmp short .upsample
|
||||
alignx 16,7
|
||||
|
||||
; Any other block: the right-hand neighbour of sample 7 is sample 0 of the
; next block, moved into the top byte of mm6.
.columnloop:
|
||||
movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
|
||||
|
||||
; Turn 8 input samples into 16 output samples:
;   out[2i]   = (3*in[i] + in[i-1] + 1) >> 2
;   out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
; On entry mm7 holds in[-1] in its low byte and mm6 holds in[8] in its top
; byte; mm0 is zero.
.upsample:
|
||||
movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mm2,mm1
|
||||
movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)
|
||||
psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
|
||||
psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
|
||||
|
||||
por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)
|
||||
por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)
|
||||
|
||||
; carry sample 7 over as the left neighbour for the next block
movq mm7,mm1
|
||||
psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
|
||||
|
||||
; zero-extend current / left / right samples to words
movq mm4,mm1
|
||||
punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)
|
||||
punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)
|
||||
movq mm5,mm2
|
||||
punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)
|
||||
punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)
|
||||
movq mm6,mm3
|
||||
punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)
|
||||
punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)
|
||||
|
||||
; 3*current, left+1, right+2
pmullw mm1,[GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4,[GOTOFF(ebx,PW_THREE)]
|
||||
paddw mm2,[GOTOFF(ebx,PW_ONE)]
|
||||
paddw mm5,[GOTOFF(ebx,PW_ONE)]
|
||||
paddw mm3,[GOTOFF(ebx,PW_TWO)]
|
||||
paddw mm6,[GOTOFF(ebx,PW_TWO)]
|
||||
|
||||
paddw mm2,mm1
|
||||
paddw mm5,mm4
|
||||
psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)
|
||||
psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)
|
||||
paddw mm3,mm1
|
||||
paddw mm6,mm4
|
||||
psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)
|
||||
psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)
|
||||
|
||||
; merge: even outputs stay in the low byte of each word, odd outputs are
; moved to the high byte
psllw mm3,BYTE_BIT
|
||||
psllw mm6,BYTE_BIT
|
||||
por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
|
||||
por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
; always stores a full 16 output bytes per input block
movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
|
||||
|
||||
sub eax, byte SIZEOF_MMWORD
|
||||
add esi, byte 1*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte 2*SIZEOF_MMWORD ; outptr
|
||||
cmp eax, byte SIZEOF_MMWORD
|
||||
; more than one block left -> general case
ja near .columnloop
|
||||
test eax,eax
|
||||
; exactly one block left -> last-block case; zero -> row finished
jnz near .columnloop_last
|
||||
|
||||
; End of row: restore the row-pointer arrays and colctr saved at .rowloop,
; step both arrays to the next row and loop while rowctr > 0.
pop esi
|
||||
pop edi
|
||||
pop eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec ecx ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
; Common exit. Also reached directly from the zero width / zero rows checks
; in the prologue, before any MMX instruction has run.
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; Again a triangle filter; see comments for h2v1 case, above.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
|
||||
%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
|
||||
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_fancy_upsample_mmx)
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov edx,eax ; edx = original ebp
|
||||
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
|
||||
test eax,eax
|
||||
jz near .return
|
||||
|
||||
mov ecx, INT [max_v_samp(edx)] ; rowctr
|
||||
test ecx,ecx
|
||||
jz near .return
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
|
||||
mov edi, POINTER [output_data_ptr(edx)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
push eax ; colctr
|
||||
push ecx
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
|
||||
test eax, SIZEOF_MMWORD-1
|
||||
jz short .skip
|
||||
push edx
|
||||
mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
pop edx
|
||||
.skip:
|
||||
; -- process the first column block
|
||||
|
||||
movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
|
||||
movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
|
||||
movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
pxor mm3,mm3 ; mm3=(all 0's)
|
||||
movq mm4,mm0
|
||||
punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)
|
||||
punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)
|
||||
movq mm5,mm1
|
||||
punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)
|
||||
punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)
|
||||
movq mm6,mm2
|
||||
punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)
|
||||
punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)
|
||||
|
||||
pmullw mm0,[GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4,[GOTOFF(ebx,PW_THREE)]
|
||||
|
||||
pcmpeqb mm7,mm7
|
||||
psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
|
||||
|
||||
paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
|
||||
paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
|
||||
paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
|
||||
paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
|
||||
|
||||
movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
|
||||
movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
|
||||
|
||||
pand mm1,mm7 ; mm1=( 0 - - -)
|
||||
pand mm2,mm7 ; mm2=( 0 - - -)
|
||||
|
||||
movq MMWORD [wk(0)], mm1
|
||||
movq MMWORD [wk(1)], mm2
|
||||
|
||||
poppic ebx
|
||||
|
||||
add eax, byte SIZEOF_MMWORD-1
|
||||
and eax, byte -SIZEOF_MMWORD
|
||||
cmp eax, byte SIZEOF_MMWORD
|
||||
ja short .columnloop
|
||||
alignx 16,7
|
||||
|
||||
.columnloop_last:
|
||||
; -- process the last column block
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
pcmpeqb mm1,mm1
|
||||
psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
|
||||
movq mm2,mm1
|
||||
|
||||
pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
|
||||
pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
|
||||
|
||||
movq MMWORD [wk(2)], mm1
|
||||
movq MMWORD [wk(3)], mm2
|
||||
|
||||
jmp short .upsample
|
||||
alignx 16,7
|
||||
|
||||
.columnloop:
|
||||
; -- process the next column block
|
||||
|
||||
movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
|
||||
movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
|
||||
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
pxor mm3,mm3 ; mm3=(all 0's)
|
||||
movq mm4,mm0
|
||||
punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)
|
||||
punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)
|
||||
movq mm5,mm1
|
||||
punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)
|
||||
punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)
|
||||
movq mm6,mm2
|
||||
punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)
|
||||
punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)
|
||||
|
||||
pmullw mm0,[GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4,[GOTOFF(ebx,PW_THREE)]
|
||||
|
||||
paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
|
||||
paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
|
||||
paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
|
||||
paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
|
||||
|
||||
movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
|
||||
movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
|
||||
|
||||
psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
|
||||
psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
|
||||
|
||||
movq MMWORD [wk(2)], mm1
|
||||
movq MMWORD [wk(3)], mm2
|
||||
|
||||
.upsample:
|
||||
; -- process the upper row
|
||||
|
||||
movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
|
||||
movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
|
||||
|
||||
movq mm0,mm7
|
||||
movq mm4,mm3
|
||||
psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)
|
||||
psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
|
||||
movq mm5,mm7
|
||||
movq mm6,mm3
|
||||
psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
|
||||
psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)
|
||||
|
||||
por mm0,mm4 ; mm0=( 1 2 3 4)
|
||||
por mm5,mm6 ; mm5=( 3 4 5 6)
|
||||
|
||||
movq mm1,mm7
|
||||
movq mm2,mm3
|
||||
psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
|
||||
psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)
|
||||
movq mm4,mm3
|
||||
psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
|
||||
|
||||
por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
|
||||
por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
|
||||
|
||||
movq MMWORD [wk(0)], mm4
|
||||
|
||||
pmullw mm7,[GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm3,[GOTOFF(ebx,PW_THREE)]
|
||||
paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm5,[GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm0,[GOTOFF(ebx,PW_SEVEN)]
|
||||
paddw mm2,[GOTOFF(ebx,PW_SEVEN)]
|
||||
|
||||
paddw mm1,mm7
|
||||
paddw mm5,mm3
|
||||
psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)
|
||||
psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)
|
||||
paddw mm0,mm7
|
||||
paddw mm2,mm3
|
||||
psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)
|
||||
psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)
|
||||
|
||||
psllw mm0,BYTE_BIT
|
||||
psllw mm2,BYTE_BIT
|
||||
por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
|
||||
por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
|
||||
movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
|
||||
|
||||
; -- process the lower row
|
||||
|
||||
movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
|
||||
movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
|
||||
|
||||
movq mm7,mm6
|
||||
movq mm3,mm4
|
||||
psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)
|
||||
psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
|
||||
movq mm0,mm6
|
||||
movq mm2,mm4
|
||||
psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
|
||||
psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)
|
||||
|
||||
por mm7,mm3 ; mm7=( 1 2 3 4)
|
||||
por mm0,mm2 ; mm0=( 3 4 5 6)
|
||||
|
||||
movq mm1,mm6
|
||||
movq mm5,mm4
|
||||
psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
|
||||
psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)
|
||||
movq mm3,mm4
|
||||
psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
|
||||
|
||||
por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
|
||||
por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
|
||||
|
||||
movq MMWORD [wk(1)], mm3
|
||||
|
||||
pmullw mm6,[GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4,[GOTOFF(ebx,PW_THREE)]
|
||||
paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm0,[GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm7,[GOTOFF(ebx,PW_SEVEN)]
|
||||
paddw mm5,[GOTOFF(ebx,PW_SEVEN)]
|
||||
|
||||
paddw mm1,mm6
|
||||
paddw mm0,mm4
|
||||
psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)
|
||||
psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)
|
||||
paddw mm7,mm6
|
||||
paddw mm5,mm4
|
||||
psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)
|
||||
psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)
|
||||
|
||||
psllw mm7,BYTE_BIT
|
||||
psllw mm5,BYTE_BIT
|
||||
por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
|
||||
por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
|
||||
|
||||
poppic ebx
|
||||
|
||||
sub eax, byte SIZEOF_MMWORD
|
||||
add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
|
||||
add ebx, byte 1*SIZEOF_MMWORD ; inptr0
|
||||
add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
|
||||
add edx, byte 2*SIZEOF_MMWORD ; outptr0
|
||||
add edi, byte 2*SIZEOF_MMWORD ; outptr1
|
||||
cmp eax, byte SIZEOF_MMWORD
|
||||
ja near .columnloop
|
||||
test eax,eax
|
||||
jnz near .columnloop_last
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
pop ecx
|
||||
pop eax
|
||||
|
||||
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
sub ecx, byte 2 ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
|
||||
; JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
|
||||
%define output_width(b) (b)+12 ; JDIMENSION output_width
|
||||
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_upsample_mmx)
|
||||
|
||||
; Doubles every sample horizontally: each input byte is written twice in a
; row to the output. One input row produces one output row.
;
; Register roles in this routine:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = rowctr (max_v_samp_factor), esi/edi = input/output row-pointer cursors
;   eax = colctr (output bytes still to produce in the current row)
;
; NOTE(review): stores cover the rounded-up width, not output_width itself,
; so the row buffers are presumably allocated with padding - confirm against
; the caller.
EXTN(jsimd_h2v1_upsample_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; Round the width up to a whole number of two-MMWORD output groups; leave
; immediately when the rounded width is zero.
mov edx, JDIMENSION [output_width(ebp)]
|
||||
add edx, byte (2*SIZEOF_MMWORD)-1
|
||||
and edx, byte -(2*SIZEOF_MMWORD)
|
||||
jz short .return
|
||||
|
||||
; Nothing to do when the row count is zero either.
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||
test ecx,ecx
|
||||
jz short .return
|
||||
|
||||
; output_data_ptr is a pointer to the row-pointer array, hence the double load.
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
mov edi, POINTER [output_data_ptr(ebp)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
; Save the row-pointer cursors; esi/edi are reused as sample pointers below.
push edi
|
||||
push esi
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
mov eax,edx ; colctr
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
; The loop body is unrolled twice. Each half reads one MMWORD of input and
; writes two MMWORDs of output.
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
|
||||
; Unpacking a register with itself interleaves each byte with its own copy,
; which is the 2x horizontal replication.
movq mm1,mm0
|
||||
punpcklbw mm0,mm0
|
||||
punpckhbw mm1,mm1
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
|
||||
|
||||
; colctr is a multiple of 2*SIZEOF_MMWORD, so it reaches exactly zero.
sub eax, byte 2*SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
|
||||
movq mm3,mm2
|
||||
punpcklbw mm2,mm2
|
||||
punpckhbw mm3,mm3
|
||||
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
|
||||
|
||||
sub eax, byte 2*SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add esi, byte 2*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte 4*SIZEOF_MMWORD ; outptr
|
||||
jmp short .columnloop
|
||||
alignx 16,7
|
||||
|
||||
.nextrow:
|
||||
; Restore the row-pointer cursors and advance both by one row.
pop esi
|
||||
pop edi
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec ecx ; rowctr
|
||||
jg short .rowloop
|
||||
|
||||
; The early exits above jump past this emms; no MMX instruction has run on
; those paths.
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
|
||||
; JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
|
||||
%define output_width(b) (b)+12 ; JDIMENSION output_width
|
||||
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_upsample_mmx)
|
||||
|
||||
; Doubles every sample horizontally and vertically: each input byte is
; written twice in a row, and the same expanded data is stored to two
; consecutive output rows. One input row produces two output rows.
;
; Register roles in this routine:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = rowctr (max_v_samp_factor), counted down by 2 per input row
;   esi = input cursor, ebx/edi = the two output rows, eax = colctr
;
; NOTE(review): stores cover the rounded-up width, not output_width itself,
; so the row buffers are presumably allocated with padding - confirm against
; the caller.
EXTN(jsimd_h2v2_upsample_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
; ebx is saved here because it is used as outptr0 below.
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; Round the width up to a whole number of two-MMWORD output groups; leave
; immediately when the rounded width is zero.
mov edx, JDIMENSION [output_width(ebp)]
|
||||
add edx, byte (2*SIZEOF_MMWORD)-1
|
||||
and edx, byte -(2*SIZEOF_MMWORD)
|
||||
jz near .return
|
||||
|
||||
; Nothing to do when the row count is zero either.
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||
test ecx,ecx
|
||||
jz short .return
|
||||
|
||||
; output_data_ptr is a pointer to the row-pointer array, hence the double load.
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
mov edi, POINTER [output_data_ptr(ebp)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
; Save the row-pointer cursors; esi/edi are reused as sample pointers below.
push edi
|
||||
push esi
|
||||
|
||||
; Fetch the input row and the two output rows it expands into.
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
mov eax,edx ; colctr
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
; The loop body is unrolled twice. Each half reads one MMWORD of input and
; writes two MMWORDs to each of the two output rows.
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
|
||||
; Unpacking a register with itself interleaves each byte with its own copy,
; which is the 2x horizontal replication.
movq mm1,mm0
|
||||
punpcklbw mm0,mm0
|
||||
punpckhbw mm1,mm1
|
||||
|
||||
; Identical data goes to both output rows (the 2x vertical replication).
movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
|
||||
movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
|
||||
|
||||
; colctr is a multiple of 2*SIZEOF_MMWORD, so it reaches exactly zero.
sub eax, byte 2*SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
|
||||
movq mm3,mm2
|
||||
punpcklbw mm2,mm2
|
||||
punpckhbw mm3,mm3
|
||||
|
||||
movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
|
||||
|
||||
sub eax, byte 2*SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add esi, byte 2*SIZEOF_MMWORD ; inptr
|
||||
add ebx, byte 4*SIZEOF_MMWORD ; outptr0
|
||||
add edi, byte 4*SIZEOF_MMWORD ; outptr1
|
||||
jmp short .columnloop
|
||||
alignx 16,7
|
||||
|
||||
.nextrow:
|
||||
; Restore the row-pointer cursors; input advances one row, output two.
pop esi
|
||||
pop edi
|
||||
|
||||
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
; rowctr counts output rows, two of which were just produced.
sub ecx, byte 2 ; rowctr
|
||||
jg short .rowloop
|
||||
|
||||
; The early exits above jump past this emms; no MMX instruction has run on
; those paths.
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
394
simd/jfmmxfst.asm
Normal file
394
simd/jfmmxfst.asm
Normal file
@@ -0,0 +1,394 @@
|
||||
;
|
||||
; jfmmxfst.asm - fast integer FDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point multipliers for the fast FDCT. CONST_BITS is the number of
; fractional bits, so each F_x_xxx is the quoted real value times
; 2^CONST_BITS, rounded to the nearest integer.
%define CONST_BITS 8 ; 14 is also OK.
|
||||
|
||||
; Precomputed values for the default precision.
%if CONST_BITS == 8
|
||||
F_0_382 equ 98 ; FIX(0.382683433)
|
||||
F_0_541 equ 139 ; FIX(0.541196100)
|
||||
F_0_707 equ 181 ; FIX(0.707106781)
|
||||
F_1_306 equ 334 ; FIX(1.306562965)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Workaround: the integer literals below are the same real values scaled by
; 2^30; DESCALE shifts them down to CONST_BITS fractional bits, adding half
; of the dropped range first so the result is rounded rather than truncated.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
|
||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
|
||||
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
; Constant table for the fast FDCT. pmulhw keeps only the high 16 bits of
; each 16x16 signed product, so the operand is shifted left by
; PRE_MULTIPLY_SCALE_BITS and the constant by CONST_SHIFT; together with the
; CONST_BITS fraction bits that makes the 16 bits pmulhw discards.
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_fdct_ifast_mmx)
|
||||
|
||||
EXTN(jconst_fdct_ifast_mmx):
|
||||
|
||||
; Each entry repeats one pre-shifted multiplier in all four 16-bit lanes of
; an MMWORD, so a single pmulhw scales four values at once.
PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
|
||||
PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
|
||||
PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
|
||||
PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_ifast_mmx (DCTELEM * data)
|
||||
;
|
||||
|
||||
%define data(b) (b)+8 ; DCTELEM * data
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_fdct_ifast_mmx)
|
||||
|
||||
; Forward DCT performed in place on the DCTELEM block passed as `data`.
; The work is split into two passes (rows, then columns); each pass handles
; four lines per loop iteration, one per 16-bit MMX lane.
;
; Register roles in this routine:
;   eax = address of the saved caller ebp on the original stack; the argument
;         is read relative to it through data(eax)
;   ebp = MMWORD-aligned frame base; [ebp] keeps the eax value so that the
;         epilogue can restore esp with "pop esp"
;   ebx = base used by the GOTOFF() constant references (set by get_GOT)
;   edx = cursor into the block, ecx = loop counter
; wk(0) and wk(1) are two MMWORD scratch slots directly below ebp.
;
; NOTE(review): pushpic/poppic/get_GOT and MMBLOCK are macros defined outside
; this view; presumably the first three reduce to plain push/pop or nothing in
; non-PIC builds - verify in the include files.
EXTN(jsimd_fdct_ifast_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
; Reserve one dword for the saved stack address, then align the frame.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
; Move esp below the wk[] scratch area.
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; Each iteration takes a band of four rows (eight MMWORDs), transposes it so
; that every register holds one column position across the four rows, and
; runs the 1-D transform on those four rows in parallel. The loop runs
; DCTSIZE/4 times, edx moving down four rows each time.
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(20 21 22 23), mm2=(24 25 26 27)
|
||||
; mm1=(30 31 32 33), mm3=(34 35 36 37)
|
||||
|
||||
movq mm4,mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
|
||||
punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
|
||||
movq mm5,mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
|
||||
punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 01 02 03), mm1=(04 05 06 07)
|
||||
; mm7=(10 11 12 13), mm3=(14 15 16 17)
|
||||
|
||||
; All eight MMX registers are in use; park two partial results in wk[].
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
|
||||
|
||||
movq mm4,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
|
||||
punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
|
||||
movq mm2,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
|
||||
punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
|
||||
|
||||
movq mm7,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
|
||||
punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
|
||||
movq mm3,mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
|
||||
punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
|
||||
|
||||
movq mm0,mm7
|
||||
movq mm5,mm6
|
||||
psubw mm7,mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6,mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0,mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5,mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
; Swap: reload the parked transpose halves and park tmp6/tmp7 in their place
; until the odd part needs them.
movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7,mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
|
||||
punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
|
||||
movq mm6,mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
|
||||
punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
|
||||
|
||||
movq mm2,mm7
|
||||
movq mm3,mm4
|
||||
paddw mm7,mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4,mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2,mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3,mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1,mm5
|
||||
movq mm6,mm0
|
||||
psubw mm5,mm7 ; mm5=tmp13
|
||||
psubw mm0,mm4 ; mm0=tmp12
|
||||
paddw mm1,mm7 ; mm1=tmp10
|
||||
paddw mm6,mm4 ; mm6=tmp11
|
||||
|
||||
; z1 = (tmp12 + tmp13) * 0.707...; the operand is pre-shifted so that the
; high word kept by pmulhw is the scaled product.
paddw mm0,mm5
|
||||
psllw mm0,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
|
||||
|
||||
movq mm7,mm1
|
||||
movq mm4,mm5
|
||||
psubw mm1,mm6 ; mm1=data4
|
||||
psubw mm5,mm0 ; mm5=data6
|
||||
paddw mm7,mm6 ; mm7=data0
|
||||
paddw mm4,mm0 ; mm4=data2
|
||||
|
||||
; Results are written back in the transposed register layout (for example
; data4 lands in the (0,1) slot), leaving each 4x4 tile transposed; the load
; comments in pass 2 reflect that layout.
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm6, MMWORD [wk(0)] ; mm6=tmp6
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=tmp7
|
||||
|
||||
paddw mm2,mm3 ; mm2=tmp10
|
||||
paddw mm3,mm6 ; mm3=tmp11
|
||||
paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
|
||||
|
||||
; Pre-shift every value that is about to be fed to pmulhw.
psllw mm2,PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm6,PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw mm3,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
|
||||
|
||||
movq mm1,mm2 ; mm1=tmp10
|
||||
; z5 = (tmp10 - tmp12) * 0.382...
psubw mm2,mm6
|
||||
pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
|
||||
pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
|
||||
pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
|
||||
paddw mm1,mm2 ; mm1=z2
|
||||
paddw mm6,mm2 ; mm6=z4
|
||||
|
||||
movq mm5,mm0
|
||||
psubw mm0,mm3 ; mm0=z13
|
||||
paddw mm5,mm3 ; mm5=z11
|
||||
|
||||
movq mm7,mm0
|
||||
movq mm4,mm5
|
||||
psubw mm0,mm1 ; mm0=data3
|
||||
psubw mm5,mm6 ; mm5=data7
|
||||
paddw mm7,mm1 ; mm7=data5
|
||||
paddw mm4,mm6 ; mm4=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
; Step to the next band of four rows.
add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .rowloop
|
||||
|
||||
; Each iteration takes a strip four columns wide and eight rows tall. The
; transpose at the top puts one whole row of the strip in each register, so
; the butterflies combine rows, i.e. transform down the columns. The loop
; runs DCTSIZE/4 times, edx moving right four columns each time.
; ---- Pass 2: process columns.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(02 12 22 32), mm2=(42 52 62 72)
|
||||
; mm1=(03 13 23 33), mm3=(43 53 63 73)
|
||||
|
||||
movq mm4,mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
|
||||
punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
|
||||
movq mm5,mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
|
||||
punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 10 20 30), mm1=(40 50 60 70)
|
||||
; mm7=(01 11 21 31), mm3=(41 51 61 71)
|
||||
|
||||
; All eight MMX registers are in use; park two partial results in wk[].
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
|
||||
|
||||
movq mm4,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
|
||||
punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
|
||||
movq mm2,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
|
||||
punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
|
||||
|
||||
movq mm7,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
|
||||
punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
|
||||
movq mm3,mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
|
||||
punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
|
||||
|
||||
movq mm0,mm7
|
||||
movq mm5,mm6
|
||||
psubw mm7,mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6,mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0,mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5,mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
; Swap: reload the parked transpose halves and park tmp6/tmp7 in their place
; until the odd part needs them.
movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7,mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
|
||||
punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
|
||||
movq mm6,mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
|
||||
punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
|
||||
|
||||
movq mm2,mm7
|
||||
movq mm3,mm4
|
||||
paddw mm7,mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4,mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2,mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3,mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1,mm5
|
||||
movq mm6,mm0
|
||||
psubw mm5,mm7 ; mm5=tmp13
|
||||
psubw mm0,mm4 ; mm0=tmp12
|
||||
paddw mm1,mm7 ; mm1=tmp10
|
||||
paddw mm6,mm4 ; mm6=tmp11
|
||||
|
||||
; z1 = (tmp12 + tmp13) * 0.707..., using the same pre-shift/pmulhw scheme as
; pass 1.
paddw mm0,mm5
|
||||
psllw mm0,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
|
||||
|
||||
movq mm7,mm1
|
||||
movq mm4,mm5
|
||||
psubw mm1,mm6 ; mm1=data4
|
||||
psubw mm5,mm0 ; mm5=data6
|
||||
paddw mm7,mm6 ; mm7=data0
|
||||
paddw mm4,mm0 ; mm4=data2
|
||||
|
||||
; Here dataN is stored straight into row N of the current column strip.
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm6, MMWORD [wk(0)] ; mm6=tmp6
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=tmp7
|
||||
|
||||
paddw mm2,mm3 ; mm2=tmp10
|
||||
paddw mm3,mm6 ; mm3=tmp11
|
||||
paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
|
||||
|
||||
; Pre-shift every value that is about to be fed to pmulhw.
psllw mm2,PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm6,PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw mm3,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
|
||||
|
||||
movq mm1,mm2 ; mm1=tmp10
|
||||
; z5 = (tmp10 - tmp12) * 0.382...
psubw mm2,mm6
|
||||
pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
|
||||
pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
|
||||
pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
|
||||
paddw mm1,mm2 ; mm1=z2
|
||||
paddw mm6,mm2 ; mm6=z4
|
||||
|
||||
movq mm5,mm0
|
||||
psubw mm0,mm3 ; mm0=z13
|
||||
paddw mm5,mm3 ; mm5=z11
|
||||
|
||||
movq mm7,mm0
|
||||
movq mm4,mm5
|
||||
psubw mm0,mm1 ; mm0=data3
|
||||
psubw mm5,mm6 ; mm5=data7
|
||||
paddw mm7,mm1 ; mm7=data5
|
||||
paddw mm4,mm6 ; mm4=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
; Step to the next strip of four columns.
add edx, byte 4*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .columnloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
; Unwind the aligned frame: [ebp] holds the stack address saved in the
; prologue, so "pop esp" switches back to the original stack, where the
; caller's ebp is on top.
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
619
simd/jfmmxint.asm
Normal file
619
simd/jfmmxint.asm
Normal file
@@ -0,0 +1,619 @@
|
||||
;
|
||||
; jfmmxint.asm - accurate integer FDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point parameters for the accurate FDCT. CONST_BITS is the number of
; fractional bits in the F_x_xxx multipliers; each multiplier is the quoted
; real value times 2^CONST_BITS, rounded to the nearest integer.
%define CONST_BITS 13
|
||||
; Pass 1 scales its outputs up by this many bits (see the psllw by PASS1_BITS
; in the row pass).
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift amounts used when descaling products: pass 1 leaves PASS1_BITS
; of extra scale in its results.
; NOTE(review): DESCALE_P2 is presumably the pass-2 shift that removes that
; extra scale again - its use is outside this view.
%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
|
||||
|
||||
; Precomputed values for the default precision.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Workaround: the integer literals below are the same real values scaled by
; 2^30; DESCALE shifts them down to CONST_BITS fractional bits, adding half
; of the dropped range first so the result is rounded rather than truncated.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_fdct_islow_mmx)
|
||||
|
||||
; Constant table for the accurate FDCT.
;
; The PW_* entries hold a pair of 16-bit multipliers repeated twice, the
; layout pmaddwd needs: with two inputs interleaved word by word as (a, b),
; one pmaddwd yields a*first + b*second in each 32-bit lane. Folding the
; shared factor of a rotation into both multipliers lets one instruction
; produce a finished output term.
EXTN(jconst_fdct_islow_mmx):
|
||||
|
||||
; Even part: applied to interleaved (tmp13, tmp12) to form data2 and data6.
PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541
|
||||
PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847)
|
||||
; Odd part: applied to interleaved (z3, z4) to form the combined z3 and z4.
PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175
|
||||
PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390)
|
||||
; Odd part: multiplier pairs for the tmp4/tmp7 terms.
PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899
|
||||
PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899)
|
||||
; Odd part: multiplier pairs for the tmp5/tmp6 terms.
PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562
|
||||
PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562)
|
||||
; Rounding terms: half of the unit that the following right shift discards.
; PD_DESCALE_P1 is added to 32-bit sums before the shift by DESCALE_P1.
PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1)
|
||||
; NOTE(review): the two entries below are presumably the pass-2 equivalents
; (32-bit and 16-bit lanes respectively) - their use is outside this view.
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
|
||||
PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1)
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_islow_mmx (DCTELEM * data)
|
||||
;
|
||||
|
||||
%define data(b) (b)+8 ; DCTELEM * data
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_fdct_islow_mmx)
|
||||
|
||||
EXTN(jsimd_fdct_islow_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(20 21 22 23), mm2=(24 25 26 27)
|
||||
; mm1=(30 31 32 33), mm3=(34 35 36 37)
|
||||
|
||||
movq mm4,mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
|
||||
punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
|
||||
movq mm5,mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
|
||||
punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 01 02 03), mm1=(04 05 06 07)
|
||||
; mm7=(10 11 12 13), mm3=(14 15 16 17)
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
|
||||
|
||||
movq mm4,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
|
||||
punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
|
||||
movq mm2,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
|
||||
punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
|
||||
|
||||
movq mm7,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
|
||||
punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
|
||||
movq mm3,mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
|
||||
punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
|
||||
|
||||
movq mm0,mm7
|
||||
movq mm5,mm6
|
||||
psubw mm7,mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6,mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0,mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5,mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7,mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
|
||||
punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
|
||||
movq mm6,mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
|
||||
punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
|
||||
|
||||
movq mm2,mm7
|
||||
movq mm3,mm4
|
||||
paddw mm7,mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4,mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2,mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3,mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1,mm5
|
||||
movq mm6,mm0
|
||||
paddw mm5,mm7 ; mm5=tmp10
|
||||
paddw mm0,mm4 ; mm0=tmp11
|
||||
psubw mm1,mm7 ; mm1=tmp13
|
||||
psubw mm6,mm4 ; mm6=tmp12
|
||||
|
||||
movq mm7,mm5
|
||||
paddw mm5,mm0 ; mm5=tmp10+tmp11
|
||||
psubw mm7,mm0 ; mm7=tmp10-tmp11
|
||||
|
||||
psllw mm5,PASS1_BITS ; mm5=data0
|
||||
psllw mm7,PASS1_BITS ; mm7=data4
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
movq mm4,mm1 ; mm1=tmp13
|
||||
movq mm0,mm1
|
||||
punpcklwd mm4,mm6 ; mm6=tmp12
|
||||
punpckhwd mm0,mm6
|
||||
movq mm1,mm4
|
||||
movq mm6,mm0
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
|
||||
|
||||
paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm4,DESCALE_P1
|
||||
psrad mm0,DESCALE_P1
|
||||
paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm1,DESCALE_P1
|
||||
psrad mm6,DESCALE_P1
|
||||
|
||||
packssdw mm4,mm0 ; mm4=data2
|
||||
packssdw mm1,mm6 ; mm1=data6
|
||||
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp6
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp7
|
||||
|
||||
movq mm0,mm2 ; mm2=tmp4
|
||||
movq mm6,mm3 ; mm3=tmp5
|
||||
paddw mm0,mm5 ; mm0=z3
|
||||
paddw mm6,mm7 ; mm6=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm4,mm0
|
||||
movq mm1,mm0
|
||||
punpcklwd mm4,mm6
|
||||
punpckhwd mm1,mm6
|
||||
movq mm0,mm4
|
||||
movq mm6,mm1
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
|
||||
movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm1,mm2
|
||||
punpcklwd mm4,mm7
|
||||
punpckhwd mm1,mm7
|
||||
movq mm2,mm4
|
||||
movq mm7,mm1
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
|
||||
|
||||
paddd mm4, MMWORD [wk(0)] ; mm4=data7L
|
||||
paddd mm1, MMWORD [wk(1)] ; mm1=data7H
|
||||
paddd mm2,mm0 ; mm2=data1L
|
||||
paddd mm7,mm6 ; mm7=data1H
|
||||
|
||||
paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm4,DESCALE_P1
|
||||
psrad mm1,DESCALE_P1
|
||||
paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm2,DESCALE_P1
|
||||
psrad mm7,DESCALE_P1
|
||||
|
||||
packssdw mm4,mm1 ; mm4=data7
|
||||
packssdw mm2,mm7 ; mm2=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
|
||||
|
||||
movq mm1,mm3
|
||||
movq mm7,mm3
|
||||
punpcklwd mm1,mm5
|
||||
punpckhwd mm7,mm5
|
||||
movq mm3,mm1
|
||||
movq mm5,mm7
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
|
||||
|
||||
paddd mm1,mm0 ; mm1=data5L
|
||||
paddd mm7,mm6 ; mm7=data5H
|
||||
paddd mm3, MMWORD [wk(0)] ; mm3=data3L
|
||||
paddd mm5, MMWORD [wk(1)] ; mm5=data3H
|
||||
|
||||
paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm1,DESCALE_P1
|
||||
psrad mm7,DESCALE_P1
|
||||
paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm3,DESCALE_P1
|
||||
psrad mm5,DESCALE_P1
|
||||
|
||||
packssdw mm1,mm7 ; mm1=data5
|
||||
packssdw mm3,mm5 ; mm3=data3
|
||||
|
||||
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
|
||||
|
||||
add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .rowloop
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(02 12 22 32), mm2=(42 52 62 72)
|
||||
; mm1=(03 13 23 33), mm3=(43 53 63 73)
|
||||
|
||||
movq mm4,mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
|
||||
punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
|
||||
movq mm5,mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
|
||||
punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 10 20 30), mm1=(40 50 60 70)
|
||||
; mm7=(01 11 21 31), mm3=(41 51 61 71)
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
|
||||
|
||||
movq mm4,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
|
||||
punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
|
||||
movq mm2,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
|
||||
punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
|
||||
|
||||
movq mm7,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
|
||||
punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
|
||||
movq mm3,mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
|
||||
punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
|
||||
|
||||
movq mm0,mm7
|
||||
movq mm5,mm6
|
||||
psubw mm7,mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6,mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0,mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5,mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7,mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
|
||||
punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
|
||||
movq mm6,mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
|
||||
punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
|
||||
|
||||
movq mm2,mm7
|
||||
movq mm3,mm4
|
||||
paddw mm7,mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4,mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2,mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3,mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1,mm5
|
||||
movq mm6,mm0
|
||||
paddw mm5,mm7 ; mm5=tmp10
|
||||
paddw mm0,mm4 ; mm0=tmp11
|
||||
psubw mm1,mm7 ; mm1=tmp13
|
||||
psubw mm6,mm4 ; mm6=tmp12
|
||||
|
||||
movq mm7,mm5
|
||||
paddw mm5,mm0 ; mm5=tmp10+tmp11
|
||||
psubw mm7,mm0 ; mm7=tmp10-tmp11
|
||||
|
||||
paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
psraw mm5,PASS1_BITS ; mm5=data0
|
||||
psraw mm7,PASS1_BITS ; mm7=data4
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
movq mm4,mm1 ; mm1=tmp13
|
||||
movq mm0,mm1
|
||||
punpcklwd mm4,mm6 ; mm6=tmp12
|
||||
punpckhwd mm0,mm6
|
||||
movq mm1,mm4
|
||||
movq mm6,mm0
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
|
||||
|
||||
paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm4,DESCALE_P2
|
||||
psrad mm0,DESCALE_P2
|
||||
paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm1,DESCALE_P2
|
||||
psrad mm6,DESCALE_P2
|
||||
|
||||
packssdw mm4,mm0 ; mm4=data2
|
||||
packssdw mm1,mm6 ; mm1=data6
|
||||
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp6
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp7
|
||||
|
||||
movq mm0,mm2 ; mm2=tmp4
|
||||
movq mm6,mm3 ; mm3=tmp5
|
||||
paddw mm0,mm5 ; mm0=z3
|
||||
paddw mm6,mm7 ; mm6=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm4,mm0
|
||||
movq mm1,mm0
|
||||
punpcklwd mm4,mm6
|
||||
punpckhwd mm1,mm6
|
||||
movq mm0,mm4
|
||||
movq mm6,mm1
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
|
||||
movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm1,mm2
|
||||
punpcklwd mm4,mm7
|
||||
punpckhwd mm1,mm7
|
||||
movq mm2,mm4
|
||||
movq mm7,mm1
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
|
||||
|
||||
paddd mm4, MMWORD [wk(0)] ; mm4=data7L
|
||||
paddd mm1, MMWORD [wk(1)] ; mm1=data7H
|
||||
paddd mm2,mm0 ; mm2=data1L
|
||||
paddd mm7,mm6 ; mm7=data1H
|
||||
|
||||
paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm4,DESCALE_P2
|
||||
psrad mm1,DESCALE_P2
|
||||
paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm2,DESCALE_P2
|
||||
psrad mm7,DESCALE_P2
|
||||
|
||||
packssdw mm4,mm1 ; mm4=data7
|
||||
packssdw mm2,mm7 ; mm2=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
|
||||
|
||||
movq mm1,mm3
|
||||
movq mm7,mm3
|
||||
punpcklwd mm1,mm5
|
||||
punpckhwd mm7,mm5
|
||||
movq mm3,mm1
|
||||
movq mm5,mm7
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
|
||||
|
||||
paddd mm1,mm0 ; mm1=data5L
|
||||
paddd mm7,mm6 ; mm7=data5H
|
||||
paddd mm3, MMWORD [wk(0)] ; mm3=data3L
|
||||
paddd mm5, MMWORD [wk(1)] ; mm5=data3H
|
||||
|
||||
paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm1,DESCALE_P2
|
||||
psrad mm7,DESCALE_P2
|
||||
paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm3,DESCALE_P2
|
||||
psrad mm5,DESCALE_P2
|
||||
|
||||
packssdw mm1,mm7 ; mm1=data5
|
||||
packssdw mm3,mm5 ; mm3=data3
|
||||
|
||||
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
|
||||
|
||||
add edx, byte 4*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .columnloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
497
simd/jimmxfst.asm
Normal file
497
simd/jimmxfst.asm
Normal file
@@ -0,0 +1,497 @@
|
||||
;
|
||||
; jimmxfst.asm - fast integer IDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration for the fast integer IDCT.
; CONST_BITS is the number of fractional bits in the F_* multipliers below:
; FIX(x) is x * 2^CONST_BITS rounded to the nearest integer
; (e.g. 1.082392200 * 256 ~= 277).
%define CONST_BITS 8 ; 14 is also OK.
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; IFAST_SCALE_BITS is not defined in this file (presumably it comes from one
; of the included .inc files -- confirm). Assembly is aborted if it differs
; from PASS1_BITS.
%if IFAST_SCALE_BITS != PASS1_BITS
|
||||
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
|
||||
%endif
|
||||
|
||||
; Precomputed table for the default CONST_BITS == 8.
%if CONST_BITS == 8
|
||||
F_1_082 equ 277 ; FIX(1.082392200)
|
||||
F_1_414 equ 362 ; FIX(1.414213562)
|
||||
F_1_847 equ 473 ; FIX(1.847759065)
|
||||
F_2_613 equ 669 ; FIX(2.613125930)
|
||||
; F_1_613 is what the code actually multiplies by: once shifted left by
; CONST_SHIFT (6 when CONST_BITS == 8), 669 would become 42816 and no longer
; fit a signed 16-bit word, whereas 413 << 6 = 26432 does.
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; The integer literals below are the same constants scaled by 2^30
; (e.g. 1162209775 / 2^30 ~= 1.082392200); DESCALE rounds them down to
; CONST_BITS fractional bits: add half of the discarded range, then shift.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
|
||||
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
|
||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_idct_ifast_mmx)
|
||||
|
||||
; Constant table for jsimd_idct_ifast_mmx. Each entry is one MMWORD wide
; (4 words or 8 bytes holding the same value).
;
; The PW_* multipliers are stored pre-shifted left by CONST_SHIFT. The code
; shifts the other operand left by PRE_MULTIPLY_SCALE_BITS and then uses
; pmulhw, which keeps the high 16 bits of each 32-bit product. Because
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16, the net result
; is (operand * FIX(x)) >> CONST_BITS.
EXTN(jconst_idct_ifast_mmx):
|
||||
|
||||
PW_F1414 times 4 dw F_1_414 << CONST_SHIFT ; 1.414213562
|
||||
PW_F1847 times 4 dw F_1_847 << CONST_SHIFT ; 1.847759065
|
||||
PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT ; -(2.613125930 - 1)
|
||||
PW_F1082 times 4 dw F_1_082 << CONST_SHIFT ; 1.082392200
|
||||
PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; added bytewise to the packed pass-2 output
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack-argument accessors. 'b' is a register pointing at the saved ebp
; pushed on entry, so +4 is the return address and +8 is the first argument.
%define dct_table(b) (b)+8 ; void * dct_table
|
||||
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b)+20 ; JDIMENSION output_col
|
||||
|
||||
; [ebp+0] holds the pre-alignment frame pointer stored by the prologue.
%define original_ebp ebp+0
|
||||
; wk(i): i-th MMWORD scratch slot, located directly below the aligned ebp.
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
; workspace: DCTSIZE2 JCOEFs located directly below wk[]; pass 1 writes it,
; pass 2 reads it.
%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
|
||||
; JCOEF workspace[DCTSIZE2]
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_ifast_mmx)
|
||||
|
||||
; Entry point: dequantize one coefficient block and apply the fast integer
; inverse DCT in two passes (columns into 'workspace', then rows into the
; output sample rows).
;
; Frame built by the prologue:
;   eax        -> saved ebp on the caller's stack (arguments are at eax+8..)
;   [ebp+0]    copy of that pointer (original_ebp), ebp itself 8-byte aligned
;   wk(0..1)   two MMWORD scratch slots just below ebp
;   workspace  DCTSIZE2 JCOEFs below wk[]; esp is moved below it
EXTN(jsimd_idct_ifast_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4 ; room for the saved frame pointer
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax ; becomes [original_ebp]
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [workspace] ; reserve wk[] and workspace
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; eax still holds the frame pointer set up in the prologue, so the reload
; below stays commented out (assumes get_GOT leaves eax untouched -- the
; macro is defined outside this file; confirm).
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; JCOEF * wsptr
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16,7
|
||||
; Top of the pass-1 loop. Each iteration handles four adjacent columns
; (one MMWORD of JCOEFs per row).
;
; Unless NO_ZERO_COLUMN_TEST_IFAST_MMX is defined, first check whether every
; AC coefficient (rows 1..7) of these four columns is zero. If so, the
; column output is just the dequantized DC term replicated, and the full
; transform at .columnDCT is skipped.
; NOTE(review): DWBLOCK/MMBLOCK are defined outside this file; the per-line
; comments imply MMBLOCK(m,0,..) addresses four coefficients of row m.
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
|
||||
; Cheap first test on one dword each of rows 1 and 2 (clobbers eax).
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
; Full test: OR together rows 1..7 of the four columns.
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1,mm0
|
||||
; Signed saturation maps every nonzero word to a nonzero byte, so the four
; words fold into one dword that can be tested in a general register.
packsswb mm1,mm1
|
||||
movd eax,mm1
|
||||
test eax,eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
; Dequantize the DC row only.
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
; Broadcast each of the four DC values across a whole MMWORD.
movq mm2,mm0 ; mm0=in0=(00 01 02 03)
|
||||
punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
|
||||
|
||||
movq mm1,mm0
|
||||
punpckldq mm0,mm0 ; mm0=(00 00 00 00)
|
||||
punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
|
||||
movq mm3,mm2
|
||||
punpckldq mm2,mm2 ; mm2=(02 02 02 02)
|
||||
punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
|
||||
|
||||
; Each source column fills both MMWORD halves of one workspace row, i.e. the
; result is stored transposed, matching the layout written by .columnDCT.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16,7
|
||||
%endif
|
||||
; Full pass-1 transform for four columns: dequantize (pmullw with the
; quantization table at edx), run the 1-D IDCT on 16-bit lanes, transpose
; the 8x4 result with punpck*, and store it into the workspace at edi.
; wk(0)/wk(1) are used as spill slots because all eight MMX registers are
; live.
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movq mm4,mm0
|
||||
movq mm5,mm1
|
||||
psubw mm0,mm2 ; mm0=tmp11
|
||||
psubw mm1,mm3 ; mm1=in2-in6
|
||||
paddw mm4,mm2 ; mm4=tmp10
|
||||
paddw mm5,mm3 ; mm5=tmp13
|
||||
|
||||
; Pre-shift + pmulhw against a constant stored << CONST_SHIFT yields
; (value * FIX(1.414213562)) >> CONST_BITS; the same pattern is used for
; every multiply below.
psllw mm1,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
|
||||
psubw mm1,mm5 ; mm1=tmp12
|
||||
|
||||
movq mm6,mm4
|
||||
movq mm7,mm0
|
||||
psubw mm4,mm5 ; mm4=tmp3
|
||||
psubw mm0,mm1 ; mm0=tmp2
|
||||
paddw mm6,mm5 ; mm6=tmp0
|
||||
paddw mm7,mm1 ; mm7=tmp1
|
||||
|
||||
movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm0,mm5
|
||||
psubw mm2,mm1 ; mm2=z12
|
||||
psubw mm5,mm3 ; mm5=z10
|
||||
paddw mm4,mm1 ; mm4=z11
|
||||
paddw mm0,mm3 ; mm0=z13
|
||||
|
||||
movq mm1,mm5 ; mm1=z10(unscaled)
|
||||
psllw mm2,PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm5,PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movq mm3,mm4
|
||||
psubw mm4,mm0 ; mm4=z11-z13
|
||||
paddw mm3,mm0 ; mm3=tmp7
|
||||
|
||||
psllw mm4,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movq mm0,mm5 ; mm0=z10(scaled)
|
||||
paddw mm5,mm2 ; mm5=z10+z12(scaled)
|
||||
pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
|
||||
pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] ; mm0=-1.613125930*z10
|
||||
pmulhw mm2,[GOTOFF(ebx,PW_F1082)] ; mm2=1.082392200*z12
|
||||
psubw mm0,mm1 ; subtract the unscaled z10 (the "- z10" term above)
|
||||
psubw mm2,mm5 ; mm2=tmp10
|
||||
paddw mm0,mm5 ; mm0=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw mm0,mm3 ; mm0=tmp6
|
||||
movq mm1,mm6
|
||||
movq mm5,mm7
|
||||
paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
|
||||
paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
|
||||
psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
|
||||
psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
|
||||
psubw mm4,mm0 ; mm4=tmp5
|
||||
|
||||
movq mm3,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
|
||||
punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
|
||||
movq mm0,mm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
|
||||
punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
|
||||
|
||||
; Swap the spilled even-part terms back in and park the half-transposed
; rows 6/7 in the same scratch slots.
movq mm7, MMWORD [wk(0)] ; mm7=tmp2
|
||||
movq mm1, MMWORD [wk(1)] ; mm1=tmp3
|
||||
|
||||
movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
|
||||
movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
|
||||
|
||||
paddw mm2,mm4 ; mm2=tmp4
|
||||
movq mm5,mm7
|
||||
movq mm0,mm1
|
||||
paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
|
||||
paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
|
||||
psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
|
||||
psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
|
||||
|
||||
movq mm4,mm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
|
||||
punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
|
||||
movq mm2,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
|
||||
punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
|
||||
|
||||
movq mm0,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm7 ; mm6=(00 10 20 30)
|
||||
punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
|
||||
movq mm5,mm3 ; transpose coefficients(phase 2)
|
||||
punpckldq mm3,mm4 ; mm3=(02 12 22 32)
|
||||
punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
|
||||
movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
|
||||
|
||||
; First MMWORD of each of the four workspace rows written this iteration.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
|
||||
|
||||
movq mm6,mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1,mm7 ; mm1=(40 50 60 70)
|
||||
punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
|
||||
movq mm0,mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2,mm4 ; mm2=(42 52 62 72)
|
||||
punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
|
||||
|
||||
; Second MMWORD of the same four workspace rows.
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
|
||||
|
||||
; Advance to the next group of four columns: input and quantization table
; move four entries along the row, the workspace moves four full rows.
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; eax was clobbered by the zero-column test in pass 1, so the frame pointer
; is reloaded here. It is then overwritten with output_col and serves as the
; sample offset within each output row for the rest of the loop.
mov eax, [original_ebp]
|
||||
lea esi, [workspace] ; JCOEF * wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16,7
|
||||
; Each iteration produces four output rows of eight samples. The arithmetic
; mirrors pass 1 without dequantization, followed by descale, saturation to
; bytes, recentering, and a byte-level transpose.
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm4,mm0
|
||||
movq mm5,mm1
|
||||
psubw mm0,mm2 ; mm0=tmp11
|
||||
psubw mm1,mm3 ; mm1=in2-in6
|
||||
paddw mm4,mm2 ; mm4=tmp10
|
||||
paddw mm5,mm3 ; mm5=tmp13
|
||||
|
||||
psllw mm1,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
|
||||
psubw mm1,mm5 ; mm1=tmp12
|
||||
|
||||
movq mm6,mm4
|
||||
movq mm7,mm0
|
||||
psubw mm4,mm5 ; mm4=tmp3
|
||||
psubw mm0,mm1 ; mm0=tmp2
|
||||
paddw mm6,mm5 ; mm6=tmp0
|
||||
paddw mm7,mm1 ; mm7=tmp1
|
||||
|
||||
movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm0,mm5
|
||||
psubw mm2,mm1 ; mm2=z12
|
||||
psubw mm5,mm3 ; mm5=z10
|
||||
paddw mm4,mm1 ; mm4=z11
|
||||
paddw mm0,mm3 ; mm0=z13
|
||||
|
||||
movq mm1,mm5 ; mm1=z10(unscaled)
|
||||
psllw mm2,PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm5,PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movq mm3,mm4
|
||||
psubw mm4,mm0 ; mm4=z11-z13
|
||||
paddw mm3,mm0 ; mm3=tmp7
|
||||
|
||||
psllw mm4,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movq mm0,mm5 ; mm0=z10(scaled)
|
||||
paddw mm5,mm2 ; mm5=z10+z12(scaled)
|
||||
pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
|
||||
pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] ; mm0=-1.613125930*z10
|
||||
pmulhw mm2,[GOTOFF(ebx,PW_F1082)] ; mm2=1.082392200*z12
|
||||
psubw mm0,mm1 ; subtract the unscaled z10 (the "- z10" term above)
|
||||
psubw mm2,mm5 ; mm2=tmp10
|
||||
paddw mm0,mm5 ; mm0=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw mm0,mm3 ; mm0=tmp6
|
||||
movq mm1,mm6
|
||||
movq mm5,mm7
|
||||
paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
|
||||
paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
|
||||
psraw mm6,(PASS1_BITS+3) ; descale
|
||||
psraw mm7,(PASS1_BITS+3) ; descale
|
||||
psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
|
||||
psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
|
||||
psraw mm1,(PASS1_BITS+3) ; descale
|
||||
psraw mm5,(PASS1_BITS+3) ; descale
|
||||
psubw mm4,mm0 ; mm4=tmp5
|
||||
|
||||
; packsswb narrows words to bytes with signed saturation.
packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
|
||||
packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movq mm3, MMWORD [wk(0)] ; mm3=tmp2
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=tmp3
|
||||
|
||||
paddw mm2,mm4 ; mm2=tmp4
|
||||
movq mm5,mm3
|
||||
movq mm1,mm0
|
||||
paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
|
||||
paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
|
||||
psraw mm3,(PASS1_BITS+3) ; descale
|
||||
psraw mm0,(PASS1_BITS+3) ; descale
|
||||
psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
|
||||
psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
|
||||
psraw mm5,(PASS1_BITS+3) ; descale
|
||||
psraw mm1,(PASS1_BITS+3) ; descale
|
||||
|
||||
movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
|
||||
packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
|
||||
|
||||
; Add CENTERJSAMPLE to every byte (wrapping add) to move the signed results
; into the sample range.
paddb mm6,mm4
|
||||
paddb mm7,mm4
|
||||
paddb mm3,mm4
|
||||
paddb mm1,mm4
|
||||
|
||||
movq mm2,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
|
||||
movq mm0,mm3 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
|
||||
punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
|
||||
|
||||
movq mm5,mm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
|
||||
movq mm4,mm0 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
|
||||
punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
|
||||
|
||||
movq mm7,mm6 ; transpose coefficients(phase 3)
|
||||
punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
|
||||
punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
|
||||
movq mm1,mm5 ; transpose coefficients(phase 3)
|
||||
punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
|
||||
punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
|
||||
|
||||
; ebx is borrowed as a second row pointer for the stores below, so the GOT
; address it normally holds is saved and restored around them.
pushpic ebx ; save GOT address
|
||||
|
||||
; Write eight samples to each of four consecutive output rows, starting at
; sample index eax (output_col) within each row.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
add esi, byte 4*SIZEOF_JCOEF ; wsptr
|
||||
add edi, byte 4*SIZEOF_JSAMPROW ; next four output row pointers
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
; Epilogue: restore the registers pushed in the prologue, then unwind the
; aligned frame via the pointer saved at [ebp+0].
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
849
simd/jimmxint.asm
Normal file
849
simd/jimmxint.asm
Normal file
@@ -0,0 +1,849 @@
|
||||
;
|
||||
; jimmxint.asm - accurate integer IDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see the jidctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 13 ; fractional bits of the F_x_xxx fixed-point constants
|
||||
%define PASS1_BITS 2 ; extra scaling carried in the pass-1 results
|
||||
|
||||
%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; right shift applied at the end of pass 1
|
||||
%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) ; right shift applied at the end of pass 2
|
||||
|
||||
%if CONST_BITS == 13 ; precomputed table: FIX(x) = x * 2^13, rounded to nearest
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants, so the constants below are written as integers scaled by 2^30.
|
||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with round-to-nearest
|
||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_idct_islow_mmx)
|
||||
|
||||
EXTN(jconst_idct_islow_mmx):
|
||||
|
||||
; Each PW_* entry is a pair of word multipliers repeated twice, used as the
; memory operand of pmaddwd against two interleaved word inputs.
PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 ; applied to (z2,z3) -> tmp3
|
||||
PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) ; applied to (z2,z3) -> tmp2
|
||||
PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 ; applied to (z3,z4) -> z3
|
||||
PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) ; applied to (z3,z4) -> z4
|
||||
PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 ; applied to (tmp0,tmp3) -> tmp0
|
||||
PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) ; applied to (tmp0,tmp3) -> tmp3
|
||||
PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 ; applied to (tmp1,tmp2) -> tmp1
|
||||
PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) ; applied to (tmp1,tmp2) -> tmp2
|
||||
PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) ; rounding term added before psrad DESCALE_P1
|
||||
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) ; rounding term added before psrad DESCALE_P2
|
||||
PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; added with paddb to the packed output bytes in pass 2
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
%define dct_table(b) (b)+8 ; void * dct_table
|
||||
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b)+20 ; JDIMENSION output_col
|
||||
|
||||
%define original_ebp ebp+0 ; slot at the aligned ebp where the prologue stores the unaligned frame base
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 12 ; the routine uses wk(0) through wk(11)
|
||||
%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
|
||||
; JCOEF workspace[DCTSIZE2], located directly below wk[]
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_islow_mmx)
|
||||
|
||||
EXTN(jsimd_idct_islow_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp (frame base; arguments are addressed as dct_table(eax) etc.)
|
||||
sub esp, byte 4 ; make room for the slot that will hold eax
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax ; [original_ebp] = unaligned frame base
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [workspace] ; reserve wk[] and workspace[] below the aligned ebp
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; mov eax, [original_ebp] ; NOTE(review): presumably omitted because eax is still live from the prologue -- confirm get_GOT preserves eax
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; JCOEF * wsptr
|
||||
mov ecx, DCTSIZE/4 ; ctr (each iteration handles 4 columns)
|
||||
alignx 16,7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap early test on two dwords (DWBLOCK presumably addresses (row,col) like MMBLOCK -- confirm against its definition)
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT ; something non-zero: do the full transform
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR together rows 1..7 of these 4 columns
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1,mm0 ; mm1 = OR of all AC rows
|
||||
packsswb mm1,mm1 ; saturating pack: a non-zero word stays a non-zero byte
|
||||
movd eax,mm1 ; low dword now covers all four columns
|
||||
test eax,eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; DC terms of the 4 columns
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; multiply by the matching dct_table entries
|
||||
|
||||
psllw mm0,PASS1_BITS ; scale up by PASS1_BITS
|
||||
|
||||
movq mm2,mm0 ; mm0=in0=(00 01 02 03)
|
||||
punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
|
||||
|
||||
movq mm1,mm0
|
||||
punpckldq mm0,mm0 ; mm0=(00 00 00 00)
|
||||
punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
|
||||
movq mm3,mm2
|
||||
punpckldq mm2,mm2 ; mm2=(02 02 02 02)
|
||||
punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; each broadcast value fills both halves of one workspace row
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
|
||||
jmp near .nextcolumn ; skip the full column transform
|
||||
alignx 16,7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movq mm4,mm1 ; mm1=in2=z2
|
||||
movq mm5,mm1
|
||||
punpcklwd mm4,mm3 ; mm3=in6=z3
|
||||
punpckhwd mm5,mm3
|
||||
movq mm1,mm4
|
||||
movq mm3,mm5
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
|
||||
|
||||
movq mm6,mm0
|
||||
paddw mm0,mm2 ; mm0=in0+in4
|
||||
psubw mm6,mm2 ; mm6=in0-in4
|
||||
|
||||
pxor mm7,mm7
|
||||
pxor mm2,mm2
|
||||
punpcklwd mm7,mm0 ; mm7=tmp0L
|
||||
punpckhwd mm2,mm0 ; mm2=tmp0H
|
||||
psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
|
||||
|
||||
movq mm0,mm7
|
||||
paddd mm7,mm4 ; mm7=tmp10L
|
||||
psubd mm0,mm4 ; mm0=tmp13L
|
||||
movq mm4,mm2
|
||||
paddd mm2,mm5 ; mm2=tmp10H
|
||||
psubd mm4,mm5 ; mm4=tmp13H
|
||||
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
|
||||
movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
|
||||
movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor mm5,mm5
|
||||
pxor mm7,mm7
|
||||
punpcklwd mm5,mm6 ; mm5=tmp1L
|
||||
punpckhwd mm7,mm6 ; mm7=tmp1H
|
||||
psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
|
||||
psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
|
||||
movq mm2,mm5
|
||||
paddd mm5,mm1 ; mm5=tmp11L
|
||||
psubd mm2,mm1 ; mm2=tmp12L
|
||||
movq mm0,mm7
|
||||
paddd mm7,mm3 ; mm7=tmp11H
|
||||
psubd mm0,mm3 ; mm0=tmp12H
|
||||
|
||||
movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
|
||||
movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
|
||||
movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
|
||||
movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movq mm5,mm6
|
||||
movq mm7,mm4
|
||||
paddw mm5,mm3 ; mm5=z3
|
||||
paddw mm7,mm1 ; mm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm2,mm5
|
||||
movq mm0,mm5
|
||||
punpcklwd mm2,mm7
|
||||
punpckhwd mm0,mm7
|
||||
movq mm5,mm2
|
||||
movq mm7,mm0
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movq mm2,mm3
|
||||
movq mm0,mm3
|
||||
punpcklwd mm2,mm4
|
||||
punpckhwd mm0,mm4
|
||||
movq mm3,mm2
|
||||
movq mm4,mm0
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
|
||||
|
||||
paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
|
||||
paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
|
||||
paddd mm3,mm5 ; mm3=tmp3L
|
||||
paddd mm4,mm7 ; mm4=tmp3H
|
||||
|
||||
movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
|
||||
movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
|
||||
|
||||
movq mm2,mm1
|
||||
movq mm0,mm1
|
||||
punpcklwd mm2,mm6
|
||||
punpckhwd mm0,mm6
|
||||
movq mm1,mm2
|
||||
movq mm6,mm0
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
|
||||
|
||||
paddd mm2,mm5 ; mm2=tmp1L
|
||||
paddd mm0,mm7 ; mm0=tmp1H
|
||||
paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
|
||||
paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
|
||||
|
||||
movq mm2,mm5
|
||||
movq mm0,mm7
|
||||
paddd mm5,mm3 ; mm5=data0L
|
||||
paddd mm7,mm4 ; mm7=data0H
|
||||
psubd mm2,mm3 ; mm2=data7L
|
||||
psubd mm0,mm4 ; mm0=data7H
|
||||
|
||||
movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm5,mm3
|
||||
paddd mm7,mm3
|
||||
psrad mm5,DESCALE_P1
|
||||
psrad mm7,DESCALE_P1
|
||||
paddd mm2,mm3
|
||||
paddd mm0,mm3
|
||||
psrad mm2,DESCALE_P1
|
||||
psrad mm0,DESCALE_P1
|
||||
|
||||
packssdw mm5,mm7 ; mm5=data0=(00 01 02 03)
|
||||
packssdw mm2,mm0 ; mm2=data7=(70 71 72 73)
|
||||
|
||||
movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
|
||||
movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
|
||||
|
||||
movq mm7,mm4
|
||||
movq mm0,mm3
|
||||
paddd mm4,mm1 ; mm4=data1L
|
||||
paddd mm3,mm6 ; mm3=data1H
|
||||
psubd mm7,mm1 ; mm7=data6L
|
||||
psubd mm0,mm6 ; mm0=data6H
|
||||
|
||||
movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm4,mm1
|
||||
paddd mm3,mm1
|
||||
psrad mm4,DESCALE_P1
|
||||
psrad mm3,DESCALE_P1
|
||||
paddd mm7,mm1
|
||||
paddd mm0,mm1
|
||||
psrad mm7,DESCALE_P1
|
||||
psrad mm0,DESCALE_P1
|
||||
|
||||
packssdw mm4,mm3 ; mm4=data1=(10 11 12 13)
|
||||
packssdw mm7,mm0 ; mm7=data6=(60 61 62 63)
|
||||
|
||||
movq mm6,mm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm5,mm4 ; mm5=(00 10 01 11)
|
||||
punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
|
||||
movq mm1,mm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm7,mm2 ; mm7=(60 70 61 71)
|
||||
punpckhwd mm1,mm2 ; mm1=(62 72 63 73)
|
||||
|
||||
movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
|
||||
movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
|
||||
movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
|
||||
movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
|
||||
|
||||
movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
|
||||
movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
|
||||
movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
|
||||
|
||||
movq mm5,mm3
|
||||
movq mm6,mm0
|
||||
paddd mm3,mm4 ; mm3=data2L
|
||||
paddd mm0,mm2 ; mm0=data2H
|
||||
psubd mm5,mm4 ; mm5=data5L
|
||||
psubd mm6,mm2 ; mm6=data5H
|
||||
|
||||
movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm3,mm7
|
||||
paddd mm0,mm7
|
||||
psrad mm3,DESCALE_P1
|
||||
psrad mm0,DESCALE_P1
|
||||
paddd mm5,mm7
|
||||
paddd mm6,mm7
|
||||
psrad mm5,DESCALE_P1
|
||||
psrad mm6,DESCALE_P1
|
||||
|
||||
packssdw mm3,mm0 ; mm3=data2=(20 21 22 23)
|
||||
packssdw mm5,mm6 ; mm5=data5=(50 51 52 53)
|
||||
|
||||
movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
|
||||
movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
|
||||
movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
|
||||
movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
|
||||
|
||||
movq mm0,mm1
|
||||
movq mm6,mm4
|
||||
paddd mm1,mm2 ; mm1=data3L
|
||||
paddd mm4,mm7 ; mm4=data3H
|
||||
psubd mm0,mm2 ; mm0=data4L
|
||||
psubd mm6,mm7 ; mm6=data4H
|
||||
|
||||
movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm1,mm2
|
||||
paddd mm4,mm2
|
||||
psrad mm1,DESCALE_P1
|
||||
psrad mm4,DESCALE_P1
|
||||
paddd mm0,mm2
|
||||
paddd mm6,mm2
|
||||
psrad mm0,DESCALE_P1
|
||||
psrad mm6,DESCALE_P1
|
||||
|
||||
packssdw mm1,mm4 ; mm1=data3=(30 31 32 33)
|
||||
packssdw mm0,mm6 ; mm0=data4=(40 41 42 43)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
|
||||
movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
|
||||
|
||||
movq mm4,mm3 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm3,mm1 ; mm3=(20 30 21 31)
|
||||
punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
|
||||
movq mm6,mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0,mm5 ; mm0=(40 50 41 51)
|
||||
punpckhwd mm6,mm5 ; mm6=(42 52 43 53)
|
||||
|
||||
movq mm1,mm7 ; transpose coefficients(phase 2)
|
||||
punpckldq mm7,mm3 ; mm7=(00 10 20 30)
|
||||
punpckhdq mm1,mm3 ; mm1=(01 11 21 31)
|
||||
movq mm5,mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2,mm4 ; mm2=(02 12 22 32)
|
||||
punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
|
||||
|
||||
movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
|
||||
movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
|
||||
|
||||
movq mm7,mm0 ; transpose coefficients(phase 2)
|
||||
punpckldq mm0,mm3 ; mm0=(40 50 60 70)
|
||||
punpckhdq mm7,mm3 ; mm7=(41 51 61 71)
|
||||
movq mm1,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm4 ; mm6=(42 52 62 72)
|
||||
punpckhdq mm1,mm4 ; mm1=(43 53 63 73)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
|
||||
|
||||
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block (advance 4 columns)
|
||||
add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr (advance 4 columns)
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr (advance 4 workspace rows)
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov eax, [original_ebp] ; reload the frame base (eax was overwritten during pass 1)
|
||||
lea esi, [workspace] ; JCOEF * wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)] ; eax = output_col (replaces the frame base)
|
||||
mov ecx, DCTSIZE/4 ; ctr (each iteration produces 4 output rows)
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movq mm4,mm1 ; mm1=in2=z2
|
||||
movq mm5,mm1
|
||||
punpcklwd mm4,mm3 ; mm3=in6=z3
|
||||
punpckhwd mm5,mm3
|
||||
movq mm1,mm4
|
||||
movq mm3,mm5
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
|
||||
|
||||
movq mm6,mm0
|
||||
paddw mm0,mm2 ; mm0=in0+in4
|
||||
psubw mm6,mm2 ; mm6=in0-in4
|
||||
|
||||
pxor mm7,mm7
|
||||
pxor mm2,mm2
|
||||
punpcklwd mm7,mm0 ; mm7=tmp0L
|
||||
punpckhwd mm2,mm0 ; mm2=tmp0H
|
||||
psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
|
||||
|
||||
movq mm0,mm7
|
||||
paddd mm7,mm4 ; mm7=tmp10L
|
||||
psubd mm0,mm4 ; mm0=tmp13L
|
||||
movq mm4,mm2
|
||||
paddd mm2,mm5 ; mm2=tmp10H
|
||||
psubd mm4,mm5 ; mm4=tmp13H
|
||||
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
|
||||
movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
|
||||
movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor mm5,mm5
|
||||
pxor mm7,mm7
|
||||
punpcklwd mm5,mm6 ; mm5=tmp1L
|
||||
punpckhwd mm7,mm6 ; mm7=tmp1H
|
||||
psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
|
||||
psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
|
||||
movq mm2,mm5
|
||||
paddd mm5,mm1 ; mm5=tmp11L
|
||||
psubd mm2,mm1 ; mm2=tmp12L
|
||||
movq mm0,mm7
|
||||
paddd mm7,mm3 ; mm7=tmp11H
|
||||
psubd mm0,mm3 ; mm0=tmp12H
|
||||
|
||||
movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
|
||||
movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
|
||||
movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
|
||||
movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm5,mm6
|
||||
movq mm7,mm4
|
||||
paddw mm5,mm3 ; mm5=z3
|
||||
paddw mm7,mm1 ; mm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm2,mm5
|
||||
movq mm0,mm5
|
||||
punpcklwd mm2,mm7
|
||||
punpckhwd mm0,mm7
|
||||
movq mm5,mm2
|
||||
movq mm7,mm0
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movq mm2,mm3
|
||||
movq mm0,mm3
|
||||
punpcklwd mm2,mm4
|
||||
punpckhwd mm0,mm4
|
||||
movq mm3,mm2
|
||||
movq mm4,mm0
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
|
||||
pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
|
||||
|
||||
paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
|
||||
paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
|
||||
paddd mm3,mm5 ; mm3=tmp3L
|
||||
paddd mm4,mm7 ; mm4=tmp3H
|
||||
|
||||
movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
|
||||
movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
|
||||
|
||||
movq mm2,mm1
|
||||
movq mm0,mm1
|
||||
punpcklwd mm2,mm6
|
||||
punpckhwd mm0,mm6
|
||||
movq mm1,mm2
|
||||
movq mm6,mm0
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
|
||||
|
||||
paddd mm2,mm5 ; mm2=tmp1L
|
||||
paddd mm0,mm7 ; mm0=tmp1H
|
||||
paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
|
||||
paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
|
||||
|
||||
movq mm2,mm5
|
||||
movq mm0,mm7
|
||||
paddd mm5,mm3 ; mm5=data0L
|
||||
paddd mm7,mm4 ; mm7=data0H
|
||||
psubd mm2,mm3 ; mm2=data7L
|
||||
psubd mm0,mm4 ; mm0=data7H
|
||||
|
||||
movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm5,mm3
|
||||
paddd mm7,mm3
|
||||
psrad mm5,DESCALE_P2
|
||||
psrad mm7,DESCALE_P2
|
||||
paddd mm2,mm3
|
||||
paddd mm0,mm3
|
||||
psrad mm2,DESCALE_P2
|
||||
psrad mm0,DESCALE_P2
|
||||
|
||||
packssdw mm5,mm7 ; mm5=data0=(00 10 20 30)
|
||||
packssdw mm2,mm0 ; mm2=data7=(07 17 27 37)
|
||||
|
||||
movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
|
||||
movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
|
||||
|
||||
movq mm7,mm4
|
||||
movq mm0,mm3
|
||||
paddd mm4,mm1 ; mm4=data1L
|
||||
paddd mm3,mm6 ; mm3=data1H
|
||||
psubd mm7,mm1 ; mm7=data6L
|
||||
psubd mm0,mm6 ; mm0=data6H
|
||||
|
||||
movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm4,mm1
|
||||
paddd mm3,mm1
|
||||
psrad mm4,DESCALE_P2
|
||||
psrad mm3,DESCALE_P2
|
||||
paddd mm7,mm1
|
||||
paddd mm0,mm1
|
||||
psrad mm7,DESCALE_P2
|
||||
psrad mm0,DESCALE_P2
|
||||
|
||||
packssdw mm4,mm3 ; mm4=data1=(01 11 21 31)
|
||||
packssdw mm7,mm0 ; mm7=data6=(06 16 26 36)
|
||||
|
||||
packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36)
|
||||
packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
|
||||
movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
|
||||
movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
|
||||
movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
|
||||
|
||||
movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
|
||||
movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movq mm7,mm6
|
||||
movq mm2,mm1
|
||||
paddd mm6,mm3 ; mm6=data2L
|
||||
paddd mm1,mm0 ; mm1=data2H
|
||||
psubd mm7,mm3 ; mm7=data5L
|
||||
psubd mm2,mm0 ; mm2=data5H
|
||||
|
||||
movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm6,mm5
|
||||
paddd mm1,mm5
|
||||
psrad mm6,DESCALE_P2
|
||||
psrad mm1,DESCALE_P2
|
||||
paddd mm7,mm5
|
||||
paddd mm2,mm5
|
||||
psrad mm7,DESCALE_P2
|
||||
psrad mm2,DESCALE_P2
|
||||
|
||||
packssdw mm6,mm1 ; mm6=data2=(02 12 22 32)
|
||||
packssdw mm7,mm2 ; mm7=data5=(05 15 25 35)
|
||||
|
||||
movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
|
||||
movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
|
||||
movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
|
||||
movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
|
||||
|
||||
movq mm1,mm4
|
||||
movq mm2,mm3
|
||||
paddd mm4,mm0 ; mm4=data3L
|
||||
paddd mm3,mm5 ; mm3=data3H
|
||||
psubd mm1,mm0 ; mm1=data4L
|
||||
psubd mm2,mm5 ; mm2=data4H
|
||||
|
||||
movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm4,mm0
|
||||
paddd mm3,mm0
|
||||
psrad mm4,DESCALE_P2
|
||||
psrad mm3,DESCALE_P2
|
||||
paddd mm1,mm0
|
||||
paddd mm2,mm0
|
||||
psrad mm1,DESCALE_P2
|
||||
psrad mm2,DESCALE_P2
|
||||
|
||||
movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
|
||||
|
||||
packssdw mm4,mm3 ; mm4=data3=(03 13 23 33)
|
||||
packssdw mm1,mm2 ; mm1=data4=(04 14 24 34)
|
||||
|
||||
movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
|
||||
|
||||
packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34)
|
||||
packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35)
|
||||
|
||||
paddb mm0,mm5
|
||||
paddb mm3,mm5
|
||||
paddb mm6,mm5
|
||||
paddb mm4,mm5
|
||||
|
||||
movq mm2,mm0 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37)
|
||||
movq mm1,mm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33)
|
||||
punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35)
|
||||
|
||||
movq mm7,mm0 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33)
|
||||
movq mm5,mm1 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17)
|
||||
punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37)
|
||||
|
||||
movq mm3,mm0 ; transpose coefficients(phase 3)
|
||||
punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07)
|
||||
punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17)
|
||||
movq mm4,mm7 ; transpose coefficients(phase 3)
|
||||
punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27)
|
||||
punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37)
|
||||
|
||||
pushpic ebx ; save GOT address
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; pointer to output row +0
|
||||
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; pointer to output row +1 (ebx reused, hence the save above)
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 ; store 8 samples (00..07) at column output_col
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 ; store 8 samples (10..17)
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; pointer to output row +2
|
||||
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; pointer to output row +3
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 ; store 8 samples (20..27)
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 ; store 8 samples (30..37)
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
add esi, byte 4*SIZEOF_JCOEF ; wsptr
|
||||
add edi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 output row pointers
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp ; restore the caller's ebp
|
||||
ret
|
||||
|
||||
703
simd/jimmxred.asm
Normal file
703
simd/jimmxred.asm
Normal file
@@ -0,0 +1,703 @@
|
||||
;
|
||||
; jimmxred.asm - reduced-size IDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; and can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains inverse-DCT routines that produce reduced-size
|
||||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see jidctred.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "simd/jsimdext.inc"
|
||||
%include "simd/jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
|
||||
%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
|
||||
%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
|
||||
%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
|
||||
|
||||
%if CONST_BITS == 13
|
||||
F_0_211 equ 1730 ; FIX(0.211164243)
|
||||
F_0_509 equ 4176 ; FIX(0.509795579)
|
||||
F_0_601 equ 4926 ; FIX(0.601344887)
|
||||
F_0_720 equ 5906 ; FIX(0.720959822)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_850 equ 6967 ; FIX(0.850430095)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_061 equ 8697 ; FIX(1.061594337)
|
||||
F_1_272 equ 10426 ; FIX(1.272758580)
|
||||
F_1_451 equ 11893 ; FIX(1.451774981)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_2_172 equ 17799 ; FIX(2.172734803)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_624 equ 29692 ; FIX(3.624509785)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
|
||||
F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
|
||||
F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
|
||||
F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
|
||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
|
||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
|
||||
F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
|
||||
F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
|
||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
|
||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_idct_red_mmx)
|
||||
|
||||
EXTN(jconst_idct_red_mmx):
|
||||
|
||||
PW_F184_MF076 times 2 dw F_1_847,-F_0_765
|
||||
PW_F256_F089 times 2 dw F_2_562, F_0_899
|
||||
PW_F106_MF217 times 2 dw F_1_061,-F_2_172
|
||||
PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509
|
||||
PW_F145_MF021 times 2 dw F_1_451,-F_0_211
|
||||
PW_F362_MF127 times 2 dw F_3_624,-F_1_272
|
||||
PW_F085_MF072 times 2 dw F_0_850,-F_0_720
|
||||
PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1)
|
||||
PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1)
|
||||
PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1)
|
||||
PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1)
|
||||
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 4x4 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
%define dct_table(b) (b)+8 ; void * dct_table
|
||||
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b)+20 ; JDIMENSION output_col
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
|
||||
; JCOEF workspace[DCTSIZE2]
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_4x4_mmx)
|
||||
|
||||
; jsimd_idct_4x4_mmx entry point.
;
; Arguments are read through the dct_table()/coef_block()/output_buf()/
; output_col() macros defined above, relative to the address at which the
; caller's ebp was pushed (offsets +8, +12, +16 and +20).
;
; Frame layout: ebp is re-pointed at a SIZEOF_MMWORD-aligned slot holding
; that address; wk(0)..wk(WK_NUM-1) sit directly below ebp and the JCOEF
; workspace[DCTSIZE2] sits below wk(0).
;
; Pass 1 runs DCTSIZE/4 iterations of 4 columns each and stores transposed
; results into the workspace; pass 2 runs once over the workspace and stores
; 4 bytes to each of 4 output rows.  Coefficient row 4 is never read, and
; pass 2 does not read MMBLOCK(4,...) of the workspace.
;
; esi and edi are saved and restored here; eax, ecx and edx are clobbered.
; ebx is handled by pushpic/poppic (macros from jsimdext.inc, not visible
; here) and is the base passed to GOTOFF() for the constant table.
EXTN(jsimd_idct_4x4_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = address of the saved caller ebp (base for the argument macros)
|
||||
sub esp, byte 4 ; room for the slot written below, before aligning
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax ; aligned slot <- eax (read back via original_ebp / "pop esp")
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [workspace] ; esp <- lowest address of wk[] + workspace[]
|
||||
pushpic ebx ; macro from jsimdext.inc (not shown); undone by poppic below
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; The reload below stays commented out: eax is expected to still hold
; the value set in the prologue.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; JCOEF * wsptr
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16,7
|
||||
; Each iteration handles 4 adjacent columns (one mmword of JCOEF per row).
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
|
||||
; Quick reject: any set bit in DWBLOCK(1,0) | DWBLOCK(2,0) means the
; full column DCT is needed.
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
; OR together the current mmword of rows 1,2,3,5,6,7 (row 4 is not used
; by this routine).
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0,mm1
|
||||
packsswb mm0,mm0 ; 4 words -> 4 bytes; signed saturation keeps nonzero values nonzero
|
||||
movd eax,mm0 ; low 4 bytes cover all 4 columns
|
||||
test eax,eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; row 0 (DC terms) of the 4 columns
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
|
||||
|
||||
psllw mm0,PASS1_BITS ; DC-only result is in0 << PASS1_BITS
|
||||
|
||||
movq mm2,mm0 ; mm0=in0=(00 01 02 03)
|
||||
punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
|
||||
|
||||
movq mm1,mm0
|
||||
punpckldq mm0,mm0 ; mm0=(00 00 00 00)
|
||||
punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
|
||||
movq mm3,mm2
|
||||
punpckldq mm2,mm2 ; mm2=(02 02 02 02)
|
||||
punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
|
||||
|
||||
; Workspace block k <- 4 copies of column k's value.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16,7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; Load rows 1,3,5,7 and dequantize each with the matching table row.
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm0=in1, mm1=in3, mm2=in5, mm3=in7 (4 columns each).
movq mm4,mm0
|
||||
movq mm5,mm0
|
||||
punpcklwd mm4,mm1 ; (in1,in3) word pairs, low two columns
|
||||
punpckhwd mm5,mm1 ; (in1,in3) word pairs, high two columns
|
||||
movq mm0,mm4
|
||||
movq mm1,mm5
|
||||
; pmaddwd on (in1,in3) pairs yields in1*F_2_562 + in3*F_0_899 and
; in1*F_1_061 - in3*F_2_172 as 32-bit partial sums.
pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
|
||||
|
||||
movq mm6,mm2
|
||||
movq mm7,mm2
|
||||
punpcklwd mm6,mm3 ; (in5,in7) word pairs, low two columns
|
||||
punpckhwd mm7,mm3 ; (in5,in7) word pairs, high two columns
|
||||
movq mm2,mm6
|
||||
movq mm3,mm7
|
||||
; pmaddwd on (in5,in7) pairs yields -in5*F_0_601 - in7*F_0_509 and
; in5*F_1_451 - in7*F_0_211 as 32-bit partial sums.
pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
|
||||
|
||||
paddd mm6,mm4 ; mm6=tmp2L
|
||||
paddd mm7,mm5 ; mm7=tmp2H
|
||||
paddd mm2,mm0 ; mm2=tmp0L
|
||||
paddd mm3,mm1 ; mm3=tmp0H
|
||||
|
||||
; Spill tmp0 so mm2/mm3 are free for the even part.
movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
|
||||
movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; Put in0 in the upper half of each dword (lower half zero), then shift
; right arithmetically: the net effect is a sign-extended
; in0 << (CONST_BITS+1).
pxor mm1,mm1
|
||||
pxor mm2,mm2
|
||||
punpcklwd mm1,mm4 ; mm1=tmp0L
|
||||
punpckhwd mm2,mm4 ; mm2=tmp0H
|
||||
psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
|
||||
psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
|
||||
|
||||
movq mm3,mm5 ; mm5=in2=z2
|
||||
punpcklwd mm5,mm0 ; mm0=in6=z3
|
||||
punpckhwd mm3,mm0
|
||||
; tmp2 = in2*F_1_847 - in6*F_0_765
pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
|
||||
|
||||
movq mm4,mm1
|
||||
movq mm0,mm2
|
||||
paddd mm1,mm5 ; mm1=tmp10L
|
||||
paddd mm2,mm3 ; mm2=tmp10H
|
||||
psubd mm4,mm5 ; mm4=tmp12L
|
||||
psubd mm0,mm3 ; mm0=tmp12H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
; data0/data3 = tmp10 +/- odd-part tmp2 (held in mm6/mm7).
movq mm5,mm1
|
||||
movq mm3,mm2
|
||||
paddd mm1,mm6 ; mm1=data0L
|
||||
paddd mm2,mm7 ; mm2=data0H
|
||||
psubd mm5,mm6 ; mm5=data3L
|
||||
psubd mm3,mm7 ; mm3=data3H
|
||||
|
||||
; Round (add 1 << (DESCALE_P1_4-1)) and shift right by DESCALE_P1_4.
movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
|
||||
|
||||
paddd mm1,mm6
|
||||
paddd mm2,mm6
|
||||
psrad mm1,DESCALE_P1_4
|
||||
psrad mm2,DESCALE_P1_4
|
||||
paddd mm5,mm6
|
||||
paddd mm3,mm6
|
||||
psrad mm5,DESCALE_P1_4
|
||||
psrad mm3,DESCALE_P1_4
|
||||
|
||||
packssdw mm1,mm2 ; mm1=data0=(00 01 02 03)
|
||||
packssdw mm5,mm3 ; mm5=data3=(30 31 32 33)
|
||||
|
||||
; data1/data2 = tmp12 +/- odd-part tmp0 (reloaded from wk[]).
movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
|
||||
movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
|
||||
|
||||
movq mm2,mm4
|
||||
movq mm3,mm0
|
||||
paddd mm4,mm7 ; mm4=data1L
|
||||
paddd mm0,mm6 ; mm0=data1H
|
||||
psubd mm2,mm7 ; mm2=data2L
|
||||
psubd mm3,mm6 ; mm3=data2H
|
||||
|
||||
movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
|
||||
|
||||
paddd mm4,mm7
|
||||
paddd mm0,mm7
|
||||
psrad mm4,DESCALE_P1_4
|
||||
psrad mm0,DESCALE_P1_4
|
||||
paddd mm2,mm7
|
||||
paddd mm3,mm7
|
||||
psrad mm2,DESCALE_P1_4
|
||||
psrad mm3,DESCALE_P1_4
|
||||
|
||||
packssdw mm4,mm0 ; mm4=data1=(10 11 12 13)
|
||||
packssdw mm2,mm3 ; mm2=data2=(20 21 22 23)
|
||||
|
||||
movq mm6,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1,mm4 ; mm1=(00 10 01 11)
|
||||
punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
|
||||
movq mm7,mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2,mm5 ; mm2=(20 30 21 31)
|
||||
punpckhwd mm7,mm5 ; mm7=(22 32 23 33)
|
||||
|
||||
movq mm0,mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1,mm2 ; mm1=(00 10 20 30)
|
||||
punpckhdq mm0,mm2 ; mm0=(01 11 21 31)
|
||||
movq mm3,mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6,mm7 ; mm6=(02 12 22 32)
|
||||
punpckhdq mm3,mm7 ; mm3=(03 13 23 33)
|
||||
|
||||
; Store the transposed results: one workspace block per input column.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
|
||||
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov eax, [original_ebp] ; eax <- base for the argument macros
|
||||
lea esi, [workspace] ; JCOEF * wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)] ; eax = output_col (base no longer needed)
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; Workspace values were dequantized in pass 1, so there is no pmullw here.
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm4,mm0
|
||||
movq mm5,mm0
|
||||
punpcklwd mm4,mm1 ; (block1,block3) word pairs, low half
|
||||
punpckhwd mm5,mm1 ; (block1,block3) word pairs, high half
|
||||
movq mm0,mm4
|
||||
movq mm1,mm5
|
||||
; Same constant pairings as the pass-1 odd part.
pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
|
||||
pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
|
||||
|
||||
movq mm6,mm2
|
||||
movq mm7,mm2
|
||||
punpcklwd mm6,mm3 ; (block5,block7) word pairs, low half
|
||||
punpckhwd mm7,mm3 ; (block5,block7) word pairs, high half
|
||||
movq mm2,mm6
|
||||
movq mm3,mm7
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
|
||||
pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
|
||||
|
||||
paddd mm6,mm4 ; mm6=tmp2L
|
||||
paddd mm7,mm5 ; mm7=tmp2H
|
||||
paddd mm2,mm0 ; mm2=tmp0L
|
||||
paddd mm3,mm1 ; mm3=tmp0H
|
||||
|
||||
movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
|
||||
movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
; Sign-extended (block0 value) << (CONST_BITS+1), built as in pass 1.
pxor mm1,mm1
|
||||
pxor mm2,mm2
|
||||
punpcklwd mm1,mm4 ; mm1=tmp0L
|
||||
punpckhwd mm2,mm4 ; mm2=tmp0H
|
||||
psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
|
||||
psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
|
||||
|
||||
movq mm3,mm5 ; mm5=in2=z2
|
||||
punpcklwd mm5,mm0 ; mm0=in6=z3
|
||||
punpckhwd mm3,mm0
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
|
||||
|
||||
movq mm4,mm1
|
||||
movq mm0,mm2
|
||||
paddd mm1,mm5 ; mm1=tmp10L
|
||||
paddd mm2,mm3 ; mm2=tmp10H
|
||||
psubd mm4,mm5 ; mm4=tmp12L
|
||||
psubd mm0,mm3 ; mm0=tmp12H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5,mm1
|
||||
movq mm3,mm2
|
||||
paddd mm1,mm6 ; mm1=data0L
|
||||
paddd mm2,mm7 ; mm2=data0H
|
||||
psubd mm5,mm6 ; mm5=data3L
|
||||
psubd mm3,mm7 ; mm3=data3H
|
||||
|
||||
; Round (add 1 << (DESCALE_P2_4-1)) and shift right by DESCALE_P2_4.
movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
|
||||
|
||||
paddd mm1,mm6
|
||||
paddd mm2,mm6
|
||||
psrad mm1,DESCALE_P2_4
|
||||
psrad mm2,DESCALE_P2_4
|
||||
paddd mm5,mm6
|
||||
paddd mm3,mm6
|
||||
psrad mm5,DESCALE_P2_4
|
||||
psrad mm3,DESCALE_P2_4
|
||||
|
||||
packssdw mm1,mm2 ; mm1=data0=(00 10 20 30)
|
||||
packssdw mm5,mm3 ; mm5=data3=(03 13 23 33)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
|
||||
movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
|
||||
|
||||
movq mm2,mm4
|
||||
movq mm3,mm0
|
||||
paddd mm4,mm7 ; mm4=data1L
|
||||
paddd mm0,mm6 ; mm0=data1H
|
||||
psubd mm2,mm7 ; mm2=data2L
|
||||
psubd mm3,mm6 ; mm3=data2H
|
||||
|
||||
movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
|
||||
|
||||
paddd mm4,mm7
|
||||
paddd mm0,mm7
|
||||
psrad mm4,DESCALE_P2_4
|
||||
psrad mm0,DESCALE_P2_4
|
||||
paddd mm2,mm7
|
||||
paddd mm3,mm7
|
||||
psrad mm2,DESCALE_P2_4
|
||||
psrad mm3,DESCALE_P2_4
|
||||
|
||||
packssdw mm4,mm0 ; mm4=data1=(01 11 21 31)
|
||||
packssdw mm2,mm3 ; mm2=data2=(02 12 22 32)
|
||||
|
||||
movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
|
||||
|
||||
; Pack to signed bytes with saturation, then add CENTERJSAMPLE to every
; byte with a wrapping paddb.  NOTE(review): this acts as the range limit
; provided CENTERJSAMPLE is 128 (defined outside this file) - confirm.
packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32)
|
||||
packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33)
|
||||
paddb mm1,mm6
|
||||
paddb mm4,mm6
|
||||
|
||||
movq mm7,mm1 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movq mm0,mm1 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33)
|
||||
|
||||
; Output rows 0 and 2 take the low dwords of mm1 and mm0.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; esi reused as a row pointer (wsptr no longer needed)
|
||||
movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
|
||||
movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
|
||||
|
||||
psrlq mm1,4*BYTE_BIT ; bring the high dword (output row 1) down
|
||||
psrlq mm0,4*BYTE_BIT ; bring the high dword (output row 3) down
|
||||
|
||||
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
|
||||
movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 2x2 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
%define dct_table(b) (b)+8 ; void * dct_table
|
||||
%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b)+20 ; JDIMENSION output_col
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_2x2_mmx)
|
||||
|
||||
; jsimd_idct_2x2_mmx entry point.
;
; Uses a plain ebp frame; the dct_table()/coef_block()/output_buf()/
; output_col() macros defined above are applied to ebp directly.
; No workspace is used: pass-1 results stay in MMX registers and feed
; pass 2 directly.  Only rows 0,1,3,5,7 of the coefficient block are loaded.
;
; ebx, esi and edi are saved and restored; eax, ecx and edx are clobbered.
; ebx is the base passed to GOTOFF() until the final store sequence, where it
; is overwritten with pixel data (after the last GOTOFF use).
EXTN(jsimd_idct_2x2_mmx):
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov edx, POINTER [dct_table(ebp)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
|
||||
|
||||
; | input: | result: |
|
||||
; | 00 01 ** 03 ** 05 ** 07 | |
|
||||
; | 10 11 ** 13 ** 15 ** 17 | |
|
||||
; | ** ** ** ** ** ** ** ** | |
|
||||
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
|
||||
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
|
||||
; | 50 51 ** 53 ** 55 ** 57 | |
|
||||
; | ** ** ** ** ** ** ** ** | |
|
||||
; | 70 71 ** 73 ** 75 ** 77 | |
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; Load the first mmword of rows 1,3,5,7 and dequantize.
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
|
||||
; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
|
||||
|
||||
; Build a mask that keeps only the upper word of each dword.
pcmpeqd mm7,mm7
|
||||
pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
|
||||
|
||||
movq mm4,mm0 ; mm4=(10 11 ** 13)
|
||||
movq mm5,mm2 ; mm5=(50 51 ** 53)
|
||||
punpcklwd mm4,mm1 ; mm4=(10 30 11 31)
|
||||
punpcklwd mm5,mm3 ; mm5=(50 70 51 71)
|
||||
; Per dword: in1*F_3_624 - in3*F_1_272 and in5*F_0_850 - in7*F_0_720.
pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
; Pair up the odd-numbered columns: shift rows 1/5 down one word and mask
; rows 3/7 to their upper words, so each dword holds a (rowN, rowN+2) pair.
psrld mm0,WORD_BIT ; mm0=(11 -- 13 --)
|
||||
pand mm1,mm7 ; mm1=(-- 31 -- 33)
|
||||
psrld mm2,WORD_BIT ; mm2=(51 -- 53 --)
|
||||
pand mm3,mm7 ; mm3=(-- 71 -- 73)
|
||||
por mm0,mm1 ; mm0=(11 31 13 33)
|
||||
por mm2,mm3 ; mm2=(51 71 53 73)
|
||||
pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
; Only the col0 lane of mm4 is consumed later (see punpckldq below).
paddd mm4,mm5 ; mm4=tmp0[col0 col1]
|
||||
|
||||
; Same treatment for the second mmword of rows 1,3,5,7 (columns 5 and 7).
movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
|
||||
; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
|
||||
|
||||
psrld mm6,WORD_BIT ; mm6=(15 -- 17 --)
|
||||
pand mm1,mm7 ; mm1=(-- 35 -- 37)
|
||||
psrld mm3,WORD_BIT ; mm3=(55 -- 57 --)
|
||||
pand mm5,mm7 ; mm5=(-- 75 -- 77)
|
||||
por mm6,mm1 ; mm6=(15 35 17 37)
|
||||
por mm3,mm5 ; mm3=(55 75 57 77)
|
||||
pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
paddd mm0,mm2 ; mm0=tmp0[col1 col3]
|
||||
paddd mm6,mm3 ; mm6=tmp0[col5 col7]
|
||||
|
||||
; -- Even part
|
||||
|
||||
; Row 0 only: both mmwords, dequantized.
movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
|
||||
|
||||
; With the value in the upper word of a dword, an arithmetic right shift
; by (WORD_BIT-CONST_BITS-2) leaves it sign-extended and scaled by
; 1 << (CONST_BITS+2).
movq mm2,mm1 ; mm2=(00 01 ** 03)
|
||||
pslld mm1,WORD_BIT ; mm1=(-- 00 -- **)
|
||||
psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
|
||||
|
||||
pand mm2,mm7 ; mm2=(-- 01 -- 03)
|
||||
pand mm5,mm7 ; mm5=(-- 05 -- 07)
|
||||
psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
|
||||
psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm3,mm1
|
||||
paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **)
|
||||
psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **)
|
||||
punpckldq mm1,mm3 ; mm1=(A0 B0)
|
||||
|
||||
; mm7 (the word mask) is no longer needed; reuse it for the rounding term.
movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
|
||||
|
||||
movq mm4,mm2
|
||||
movq mm3,mm5
|
||||
paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3)
|
||||
paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7)
|
||||
psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3)
|
||||
psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7)
|
||||
|
||||
; Round (add 1 << (DESCALE_P1_2-1)) and shift right by DESCALE_P1_2.
paddd mm1,mm7
|
||||
psrad mm1,DESCALE_P1_2
|
||||
|
||||
paddd mm2,mm7
|
||||
paddd mm5,mm7
|
||||
psrad mm2,DESCALE_P1_2
|
||||
psrad mm5,DESCALE_P1_2
|
||||
paddd mm4,mm7
|
||||
paddd mm3,mm7
|
||||
psrad mm4,DESCALE_P1_2
|
||||
psrad mm3,DESCALE_P1_2
|
||||
|
||||
; ---- Pass 2: process rows, store into output array.
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(ebp)]
|
||||
|
||||
; | input:| result:|
|
||||
; | A0 B0 | |
|
||||
; | A1 B1 | C0 C1 |
|
||||
; | A3 B3 | D0 D1 |
|
||||
; | A5 B5 | |
|
||||
; | A7 B7 | |
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; Pack the dword results to words so pmaddwd can apply the same
; constant pairs as in pass 1.
packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3)
|
||||
packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7)
|
||||
pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
paddd mm2,mm5 ; mm2=tmp0[row0 row1]
|
||||
|
||||
; -- Even part
|
||||
|
||||
pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1]
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
|
||||
|
||||
movq mm6,mm1
|
||||
paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1)
|
||||
psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1)
|
||||
|
||||
; Round (add 1 << (DESCALE_P2_2-1)) and shift right by DESCALE_P2_2.
paddd mm1,mm0
|
||||
paddd mm6,mm0
|
||||
psrad mm1,DESCALE_P2_2
|
||||
psrad mm6,DESCALE_P2_2
|
||||
|
||||
movq mm7,mm1 ; transpose coefficients
|
||||
punpckldq mm1,mm6 ; mm1=(C0 D0)
|
||||
punpckhdq mm7,mm6 ; mm7=(C1 D1)
|
||||
|
||||
; Saturating packs down to bytes, then a wrapping add of CENTERJSAMPLE.
; This paddb is the last GOTOFF(ebx,...) access in the routine.
packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1)
|
||||
packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
|
||||
paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||
|
||||
; ebx is overwritten with pixel data here; the caller's value is
; restored by "pop ebx" in the epilogue.
movd ecx,mm1
|
||||
movd ebx,mm1 ; ebx=(C0 D0 C1 D1)
|
||||
shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --)
|
||||
|
||||
; Two bytes go to each of the two output rows at column output_col.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
|
||||
mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
97
simd/jsimd.h
97
simd/jsimd.h
@@ -11,13 +11,108 @@
|
||||
/* Bitmask for supported acceleration methods */
|
||||
|
||||
#define JSIMD_NONE 0x00
|
||||
#define JSIMD_MMX 0x01
|
||||
|
||||
/* Short forms of external names for systems with brain-damaged linkers. */
|
||||
|
||||
#ifdef NEED_SHORT_EXTERNAL_NAMES
|
||||
#define jpeg_simd_cpu_support jSiCpuSupport
|
||||
#define jpeg_simd_cpu_support jSiCpuSupport
|
||||
#define jsimd_rgb_ycc_convert_mmx jSRGBYCCM
|
||||
#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM
|
||||
#define jsimd_h2v2_downsample_mmx jSDnH2V2M
|
||||
#define jsimd_h2v1_downsample_mmx jSDnH2V1M
|
||||
#define jsimd_h2v2_upsample_mmx jSUpH2V2M
|
||||
#define jsimd_h2v1_upsample_mmx jSUpH2V1M
|
||||
#define jsimd_h2v2_fancy_upsample_mmx jSFUpH2V2M
|
||||
#define jsimd_h2v1_fancy_upsample_mmx jSFUpH2V1M
|
||||
#define jsimd_h2v2_merged_upsample_mmx jSMUpH2V2M
|
||||
#define jsimd_h2v1_merged_upsample_mmx jSMUpH2V1M
|
||||
#define jsimd_convsamp_mmx jSConvM
|
||||
#define jsimd_fdct_islow_mmx jSFDMIS
|
||||
#define jsimd_fdct_ifast_mmx jSFDMIF
|
||||
#define jsimd_quantize_mmx jSQuantM
|
||||
#define jsimd_idct_2x2_mmx jSIDM22
|
||||
#define jsimd_idct_4x4_mmx jSIDM44
|
||||
#define jsimd_idct_islow_mmx jSIDMIS
|
||||
#define jsimd_idct_ifast_mmx jSIDMIF
|
||||
#endif /* NEED_SHORT_EXTERNAL_NAMES */
|
||||
|
||||
/* SIMD Ext: retrieve SIMD/CPU information */
|
||||
EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
|
||||
|
||||
/* SIMD Color Space Conversion */
|
||||
EXTERN(void) jsimd_rgb_ycc_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_mmx
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
|
||||
/* SIMD Downsample */
|
||||
EXTERN(void) jsimd_h2v2_downsample_mmx
|
||||
JPP((JDIMENSION image_width, int max_v_samp_factor,
|
||||
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data));
|
||||
EXTERN(void) jsimd_h2v1_downsample_mmx
|
||||
JPP((JDIMENSION image_width, int max_v_samp_factor,
|
||||
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data));
|
||||
|
||||
/* SIMD Upsample */
|
||||
EXTERN(void) jsimd_h2v2_upsample_mmx
|
||||
JPP((int max_v_samp_factor, JDIMENSION output_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
EXTERN(void) jsimd_h2v1_upsample_mmx
|
||||
JPP((int max_v_samp_factor, JDIMENSION output_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
|
||||
EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
|
||||
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
|
||||
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
|
||||
EXTERN(void) jsimd_h2v2_merged_upsample_mmx
|
||||
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
|
||||
EXTERN(void) jsimd_h2v1_merged_upsample_mmx
|
||||
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
|
||||
|
||||
/* SIMD Sample Conversion */
|
||||
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
|
||||
JDIMENSION start_col,
|
||||
DCTELEM * workspace));
|
||||
|
||||
/* SIMD Forward DCT */
|
||||
EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
|
||||
EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
|
||||
|
||||
/* SIMD Quantization */
|
||||
EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
|
||||
DCTELEM * divisors,
|
||||
DCTELEM * workspace));
|
||||
|
||||
/* SIMD Reduced Inverse DCT */
|
||||
EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
/* SIMD Inverse DCT */
|
||||
EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
// This file generates the include file for the assembly
|
||||
// implementations by abusing the C preprocessor.
|
||||
//
|
||||
// Note: Some things are manually defined as they need to
|
||||
// be mapped to NASM types.
|
||||
|
||||
;
|
||||
; Automatically generated include file from jsimdcfg.inc.h
|
||||
@@ -15,15 +18,101 @@
|
||||
#define define(var) %define _cpp_protection_##var
|
||||
#define definev(var) %define _cpp_protection_##var var
|
||||
|
||||
;
|
||||
; -- jpeglib.h
|
||||
;
|
||||
|
||||
definev(DCTSIZE)
|
||||
definev(DCTSIZE2)
|
||||
|
||||
;
|
||||
; -- jmorecfg.h
|
||||
;
|
||||
|
||||
definev(RGB_RED)
|
||||
definev(RGB_GREEN)
|
||||
definev(RGB_BLUE)
|
||||
|
||||
definev(RGB_PIXELSIZE)
|
||||
|
||||
; Representation of a single sample (pixel element value).
|
||||
; On this SIMD implementation, this must be 'unsigned char'.
|
||||
;
|
||||
|
||||
%define JSAMPLE byte ; unsigned char
|
||||
%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
|
||||
|
||||
definev(CENTERJSAMPLE)
|
||||
|
||||
; Representation of a DCT frequency coefficient.
|
||||
; On this SIMD implementation, this must be 'short'.
|
||||
;
|
||||
%define JCOEF word ; short
|
||||
%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
|
||||
|
||||
; Datatype used for image dimensions.
|
||||
; On this SIMD implementation, this must be 'unsigned int'.
|
||||
;
|
||||
%define JDIMENSION dword ; unsigned int
|
||||
%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
|
||||
|
||||
%define JSAMPROW POINTER ; JSAMPLE FAR * (jpeglib.h)
|
||||
%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
|
||||
%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
|
||||
%define JCOEFPTR POINTER ; JCOEF FAR * (jpeglib.h)
|
||||
%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
|
||||
%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
|
||||
%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
|
||||
%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
|
||||
|
||||
;
|
||||
; -- jdct.h
|
||||
;
|
||||
|
||||
; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
|
||||
; the DCT is to be performed in-place in that buffer.
|
||||
; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
|
||||
;
|
||||
%define DCTELEM word ; short
|
||||
%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
|
||||
|
||||
; To maximize parallelism, Type MULTIPLIER is changed to short.
|
||||
;
|
||||
%define ISLOW_MULT_TYPE word ; must be short
|
||||
%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
|
||||
|
||||
%define IFAST_MULT_TYPE word ; must be short
|
||||
%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
|
||||
%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
|
||||
|
||||
;
|
||||
; -- jsimd.h
|
||||
;
|
||||
|
||||
definev(JSIMD_NONE)
|
||||
definev(JSIMD_MMX)
|
||||
|
||||
; Short forms of external names for systems with brain-damaged linkers.
|
||||
;
|
||||
#ifdef NEED_SHORT_EXTERNAL_NAMES
|
||||
definev(jpeg_simd_cpu_support)
|
||||
definev(jsimd_rgb_ycc_convert_mmx)
|
||||
definev(jsimd_ycc_rgb_convert_mmx)
|
||||
definev(jsimd_h2v2_downsample_mmx)
|
||||
definev(jsimd_h2v1_downsample_mmx)
|
||||
definev(jsimd_h2v2_upsample_mmx)
|
||||
definev(jsimd_h2v1_upsample_mmx)
|
||||
definev(jsimd_h2v1_fancy_upsample_mmx)
|
||||
definev(jsimd_h2v2_fancy_upsample_mmx)
|
||||
definev(jsimd_h2v1_merged_upsample_mmx)
|
||||
definev(jsimd_h2v2_merged_upsample_mmx)
|
||||
definev(jsimd_convsamp_mmx)
|
||||
definev(jsimd_fdct_islow_mmx)
|
||||
definev(jsimd_fdct_ifast_mmx)
|
||||
definev(jsimd_quantize_mmx)
|
||||
definev(jsimd_idct_2x2_mmx)
|
||||
definev(jsimd_idct_4x4_mmx)
|
||||
definev(jsimd_idct_islow_mmx)
|
||||
definev(jsimd_idct_ifast_mmx)
|
||||
#endif /* NEED_SHORT_EXTERNAL_NAMES */
|
||||
|
||||
|
||||
@@ -51,6 +51,22 @@ EXTN(jpeg_simd_cpu_support):
|
||||
xor eax,edx
|
||||
jz short .return ; CPUID is not supported
|
||||
|
||||
; Check for MMX instruction support
|
||||
xor eax,eax
|
||||
cpuid
|
||||
test eax,eax
|
||||
jz short .return
|
||||
|
||||
xor eax,eax
|
||||
inc eax
|
||||
cpuid
|
||||
mov eax,edx ; eax = Standard feature flags
|
||||
|
||||
test eax, 1<<23 ; bit23:MMX
|
||||
jz short .no_mmx
|
||||
or edi, byte JSIMD_MMX
|
||||
.no_mmx:
|
||||
|
||||
.return:
|
||||
mov eax,edi
|
||||
|
||||
|
||||
@@ -107,6 +107,24 @@
|
||||
%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
|
||||
%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
|
||||
|
||||
%define INT dword ; signed integer type
|
||||
%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
|
||||
%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
|
||||
|
||||
%define MMWORD qword ; int64 (MMX register)
|
||||
%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
|
||||
%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
|
||||
|
||||
%define SIZEOF_BYTE 1 ; sizeof(BYTE)
|
||||
%define SIZEOF_WORD 2 ; sizeof(WORD)
|
||||
%define SIZEOF_DWORD 4 ; sizeof(DWORD)
|
||||
%define SIZEOF_QWORD 8 ; sizeof(QWORD)
|
||||
|
||||
%define BYTE_BIT 8 ; CHAR_BIT in C
|
||||
%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
|
||||
%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
|
||||
%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; External Symbol Name
|
||||
;
|
||||
|
||||
Reference in New Issue
Block a user