SIMD-accelerated RGB-to-Grayscale color conversion
This commit is contained in:
@@ -148,6 +148,8 @@ if WITH_SIMD
|
||||
else
|
||||
cmp $(srcdir)/testimgflt-nosimd.jpg testoutflt.jpg
|
||||
endif
|
||||
./cjpeg -dct int -grayscale -outfile testoutgray.jpg $(srcdir)/testorig.ppm
|
||||
cmp $(srcdir)/testimggray.jpg testoutgray.jpg
|
||||
./djpeg -dct int -fast -ppm -outfile testoutint.ppm $(srcdir)/testorig.jpg
|
||||
cmp $(srcdir)/testimgint.ppm testoutint.ppm
|
||||
./djpeg -dct fast -ppm -outfile testoutfst.ppm $(srcdir)/testorig.jpg
|
||||
|
||||
104
jccolor.c
104
jccolor.c
@@ -81,74 +81,6 @@ typedef my_color_converter * my_cconvert_ptr;
|
||||
#define TABLE_SIZE (8*(MAXJSAMPLE+1))
|
||||
|
||||
|
||||
#if BITS_IN_JSAMPLE == 8
|
||||
|
||||
/*
 * Red-channel term of the 8-bit luminance sum.  rgb_gray_convert() indexes
 * this table with the red sample and adds the result to the green_lut and
 * blue_lut terms.  The entries run from 0 to 76 and appear to follow
 * 0.299 * i rounded to the nearest integer (the R weight in
 * Y = 0.299 R + 0.587 G + 0.114 B).
 */
static const unsigned char red_lut[256] = {
|
||||
0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
|
||||
5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
|
||||
10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
|
||||
14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
|
||||
19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
|
||||
24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
|
||||
29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
|
||||
33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
|
||||
38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
|
||||
43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
|
||||
48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
|
||||
53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
|
||||
57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
|
||||
62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
|
||||
67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
|
||||
72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
|
||||
};
|
||||
|
||||
/*
 * Green-channel term of the 8-bit luminance sum.  rgb_gray_convert() indexes
 * this table with the green sample and adds the result to the red_lut and
 * blue_lut terms.  Entry i is 0.587 * i rounded to the nearest integer
 * (the G weight in Y = 0.299 R + 0.587 G + 0.114 B), so the table is
 * non-decreasing and runs from 0 to 150.
 *
 * Entry 228 was previously written as 34 instead of 134, which made the
 * computed luminance 100 too low for every pixel whose green sample is 228.
 */
static const unsigned char green_lut[256] = {
  0,   1,   1,   2,   2,   3,   4,   4,   5,   5,   6,   6,
  7,   8,   8,   9,   9,   10,  11,  11,  12,  12,  13,  14,
  14,  15,  15,  16,  16,  17,  18,  18,  19,  19,  20,  21,
  21,  22,  22,  23,  23,  24,  25,  25,  26,  26,  27,  28,
  28,  29,  29,  30,  31,  31,  32,  32,  33,  33,  34,  35,
  35,  36,  36,  37,  38,  38,  39,  39,  40,  41,  41,  42,
  42,  43,  43,  44,  45,  45,  46,  46,  47,  48,  48,  49,
  49,  50,  50,  51,  52,  52,  53,  53,  54,  55,  55,  56,
  56,  57,  58,  58,  59,  59,  60,  60,  61,  62,  62,  63,
  63,  64,  65,  65,  66,  66,  67,  68,  68,  69,  69,  70,
  70,  71,  72,  72,  73,  73,  74,  75,  75,  76,  76,  77,
  77,  78,  79,  79,  80,  80,  81,  82,  82,  83,  83,  84,
  85,  85,  86,  86,  87,  87,  88,  89,  89,  90,  90,  91,
  92,  92,  93,  93,  94,  95,  95,  96,  96,  97,  97,  98,
  99,  99,  100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
  106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
  113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
  120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
  127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
  134, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
  141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
  148, 149, 149, 150
};
|
||||
|
||||
/*
 * Blue-channel term of the 8-bit luminance sum.  rgb_gray_convert() indexes
 * this table with the blue sample and adds the result to the red_lut and
 * green_lut terms.  The entries run from 0 to 29 and appear to follow
 * 0.114 * i rounded to the nearest integer (the B weight in
 * Y = 0.299 R + 0.587 G + 0.114 B).
 */
static const unsigned char blue_lut[256] = {
|
||||
0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
|
||||
2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
|
||||
4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
|
||||
5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
|
||||
7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
|
||||
9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
|
||||
11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
|
||||
16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
|
||||
18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
|
||||
22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Initialize for RGB->YCC colorspace conversion.
|
||||
*/
|
||||
@@ -259,36 +191,26 @@ rgb_gray_convert (j_compress_ptr cinfo,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
|
||||
#if BITS_IN_JSAMPLE != 8
|
||||
register int r, g, b;
|
||||
register INT32 * ctab = cconvert->rgb_ycc_tab;
|
||||
#endif
|
||||
register JSAMPROW inptr;
|
||||
register JSAMPROW outptr;
|
||||
JSAMPLE *maxoutptr;
|
||||
register JDIMENSION col;
|
||||
JDIMENSION num_cols = cinfo->image_width;
|
||||
int rindex = rgb_red[cinfo->in_color_space];
|
||||
int gindex = rgb_green[cinfo->in_color_space];
|
||||
int bindex = rgb_blue[cinfo->in_color_space];
|
||||
int rgbstride = rgb_pixelsize[cinfo->in_color_space];
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr = *input_buf++;
|
||||
outptr = output_buf[0][output_row];
|
||||
maxoutptr = &outptr[num_cols];
|
||||
output_row++;
|
||||
for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
|
||||
for (col = 0; col < num_cols; col++) {
|
||||
r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
|
||||
g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
|
||||
b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
|
||||
inptr += rgb_pixelsize[cinfo->in_color_space];
|
||||
/* Y */
|
||||
#if BITS_IN_JSAMPLE == 8
|
||||
*outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
|
||||
+ blue_lut[inptr[bindex]];
|
||||
#else
|
||||
*outptr = (JSAMPLE)
|
||||
((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
|
||||
+ ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
|
||||
+ ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
|
||||
>> SCALEBITS);
|
||||
#endif
|
||||
outptr[col] = (JSAMPLE)
|
||||
((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
|
||||
>> SCALEBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -490,8 +412,12 @@ jinit_color_converter (j_compress_ptr cinfo)
|
||||
cinfo->in_color_space == JCS_EXT_BGRX ||
|
||||
cinfo->in_color_space == JCS_EXT_XBGR ||
|
||||
cinfo->in_color_space == JCS_EXT_XRGB) {
|
||||
cconvert->pub.start_pass = rgb_ycc_start;
|
||||
cconvert->pub.color_convert = rgb_gray_convert;
|
||||
if (jsimd_can_rgb_gray())
|
||||
cconvert->pub.color_convert = jsimd_rgb_gray_convert;
|
||||
else {
|
||||
cconvert->pub.start_pass = rgb_ycc_start;
|
||||
cconvert->pub.color_convert = rgb_gray_convert;
|
||||
}
|
||||
} else if (cinfo->in_color_space == JCS_YCbCr)
|
||||
cconvert->pub.color_convert = grayscale_convert;
|
||||
else
|
||||
|
||||
7
jsimd.h
7
jsimd.h
@@ -13,8 +13,10 @@
|
||||
|
||||
#ifdef NEED_SHORT_EXTERNAL_NAMES
|
||||
#define jsimd_can_rgb_ycc jSCanRgbYcc
|
||||
#define jsimd_can_rgb_gray jSCanRgbGry
|
||||
#define jsimd_can_ycc_rgb jSCanYccRgb
|
||||
#define jsimd_rgb_ycc_convert jSRgbYccConv
|
||||
#define jsimd_rgb_gray_convert jSRgbGryConv
|
||||
#define jsimd_ycc_rgb_convert jSYccRgbConv
|
||||
#define jsimd_can_h2v2_downsample jSCanH2V2Down
|
||||
#define jsimd_can_h2v1_downsample jSCanH2V1Down
|
||||
@@ -35,12 +37,17 @@
|
||||
#endif /* NEED_SHORT_EXTERNAL_NAMES */
|
||||
|
||||
EXTERN(int) jsimd_can_rgb_ycc JPP((void));
|
||||
EXTERN(int) jsimd_can_rgb_gray JPP((void));
|
||||
EXTERN(int) jsimd_can_ycc_rgb JPP((void));
|
||||
|
||||
EXTERN(void) jsimd_rgb_ycc_convert
|
||||
JPP((j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_rgb_gray_convert
|
||||
JPP((j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_rgb_convert
|
||||
JPP((j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
|
||||
13
jsimd_none.c
13
jsimd_none.c
@@ -24,6 +24,12 @@ jsimd_can_rgb_ycc (void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Capability query for SIMD RGB-to-grayscale conversion in the no-SIMD
 * build.  Always returns 0, so a caller that tests this result before
 * installing jsimd_rgb_gray_convert keeps its plain C converter instead.
 */
GLOBAL(int)
|
||||
jsimd_can_rgb_gray (void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_ycc_rgb (void)
|
||||
{
|
||||
@@ -37,6 +43,13 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
||||
{
|
||||
}
|
||||
|
||||
/*
 * Placeholder for the SIMD RGB-to-grayscale converter in the no-SIMD build.
 * The body is intentionally empty: it reads nothing and writes nothing to
 * output_buf.  jsimd_can_rgb_gray() in this file returns 0, so callers that
 * honour that check should never install or invoke this function.
 */
GLOBAL(void)
|
||||
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
|
||||
@@ -11,7 +11,7 @@ if SIMD_X86_64
|
||||
libsimd_la_SOURCES = jsimd_x86_64.c \
|
||||
jsimd.h jsimdcfg.inc.h \
|
||||
jsimdext.inc jcolsamp.inc jdct.inc \
|
||||
jfsseflt-64.asm \
|
||||
jfsseflt-64.asm jcgrass2-64.asm \
|
||||
jccolss2-64.asm jdcolss2-64.asm \
|
||||
jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
|
||||
jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
|
||||
@@ -20,6 +20,7 @@ libsimd_la_SOURCES = jsimd_x86_64.c \
|
||||
|
||||
jccolss2-64.lo: jcclrss2-64.asm
|
||||
jdcolss2-64.lo: jdclrss2-64.asm
|
||||
jcgrass2-64.lo: jcgryss2-64.asm
|
||||
jdmerss2-64.lo: jdmrgss2-64.asm
|
||||
endif
|
||||
|
||||
@@ -29,20 +30,22 @@ libsimd_la_SOURCES = jsimd_i386.c \
|
||||
jsimd.h jsimdcfg.inc.h \
|
||||
jsimdext.inc jcolsamp.inc jdct.inc \
|
||||
jsimdcpu.asm \
|
||||
jccolmmx.asm jdcolmmx.asm \
|
||||
jccolmmx.asm jdcolmmx.asm jcgrammx.asm \
|
||||
jcsammmx.asm jdsammmx.asm jdmermmx.asm \
|
||||
jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \
|
||||
jimmxred.asm jimmxint.asm jimmxfst.asm \
|
||||
jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \
|
||||
jcqntsse.asm jfsseflt.asm jisseflt.asm \
|
||||
jccolss2.asm jdcolss2.asm \
|
||||
jccolss2.asm jdcolss2.asm jcgrass2.asm \
|
||||
jcsamss2.asm jdsamss2.asm jdmerss2.asm \
|
||||
jcqnts2i.asm jfss2fst.asm jfss2int.asm \
|
||||
jiss2red.asm jiss2int.asm jiss2fst.asm \
|
||||
jcqnts2f.asm jiss2flt.asm
|
||||
|
||||
jccolmmx.lo: jcclrmmx.asm
|
||||
jcgrammx.lo: jcgrymmx.asm
|
||||
jccolss2.lo: jcclrss2.asm
|
||||
jcgrass2.lo: jcgryss2.asm
|
||||
jdcolmmx.lo: jdclrmmx.asm
|
||||
jdcolss2.lo: jdclrss2.asm
|
||||
jdmermmx.lo: jdmrgmmx.asm
|
||||
|
||||
113
simd/jcgrammx.asm
Normal file
113
simd/jcgrammx.asm
Normal file
@@ -0,0 +1,113 @@
|
||||
;
|
||||
; jcgrammx.asm - grayscale colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2011 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS 16
|
||||
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_rgb_gray_convert_mmx)
|
||||
|
||||
EXTN(jconst_rgb_gray_convert_mmx):
|
||||
|
||||
PW_F0299_F0337 times 2 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 2 dw F_0_114, F_0_250
|
||||
PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
%include "jcgrymmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
|
||||
%include "jcgrymmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
|
||||
%include "jcgrymmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
|
||||
%include "jcgrymmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
|
||||
%include "jcgrymmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 3
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 1
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
|
||||
%include "jcgrymmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 1
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 3
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
|
||||
%include "jcgrymmx.asm"
|
||||
110
simd/jcgrass2-64.asm
Normal file
110
simd/jcgrass2-64.asm
Normal file
@@ -0,0 +1,110 @@
|
||||
;
|
||||
; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; Copyright (C) 2011, D. R. Commander.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS 16
|
||||
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_rgb_gray_convert_sse2)
|
||||
|
||||
EXTN(jconst_rgb_gray_convert_sse2):
|
||||
|
||||
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
|
||||
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
%include "jcgryss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
|
||||
%include "jcgryss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
|
||||
%include "jcgryss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
|
||||
%include "jcgryss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
|
||||
%include "jcgryss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 3
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 1
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
|
||||
%include "jcgryss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 1
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 3
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
|
||||
%include "jcgryss2-64.asm"
|
||||
110
simd/jcgrass2.asm
Normal file
110
simd/jcgrass2.asm
Normal file
@@ -0,0 +1,110 @@
|
||||
;
|
||||
; jcgrass2.asm - grayscale colorspace conversion (SSE2)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; Copyright (C) 2011, D. R. Commander.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS 16
|
||||
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_rgb_gray_convert_sse2)
|
||||
|
||||
EXTN(jconst_rgb_gray_convert_sse2):
|
||||
|
||||
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
|
||||
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
%include "jcgryss2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
|
||||
%include "jcgryss2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
|
||||
%include "jcgryss2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
|
||||
%include "jcgryss2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
|
||||
%include "jcgryss2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 3
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 1
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
|
||||
%include "jcgryss2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 1
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 3
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
|
||||
%include "jcgryss2.asm"
|
||||
359
simd/jcgrymmx.asm
Normal file
359
simd/jcgrymmx.asm
Normal file
@@ -0,0 +1,359 @@
|
||||
;
|
||||
; jcgrymmx.asm - grayscale colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2011 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
|
||||
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
%define img_width(b) (b)+8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b)+20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b)+24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_rgb_gray_convert_mmx)
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_mmx):
|
||||
push ebp
|
||||
mov eax,esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp],eax
|
||||
mov ebp,esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
|
||||
test ecx,ecx
|
||||
jz near .return
|
||||
|
||||
push ecx
|
||||
|
||||
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax,eax
|
||||
jle near .return
|
||||
alignx 16,7
|
||||
.rowloop:
|
||||
pushpic eax
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae short .columnloop
|
||||
alignx 16,7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push eax
|
||||
push edx
|
||||
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
xor eax,eax
|
||||
mov al, BYTE [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
xor edx,edx
|
||||
mov dx, WORD [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax,edx
|
||||
.column_ld4:
|
||||
movd mmA,eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd mmG, DWORD [esi+ecx]
|
||||
psllq mmA, DWORD_BIT
|
||||
por mmA,mmG
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
movq mmG,mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jmp short .rgb_gray_cnv
|
||||
.column_ld16:
|
||||
test cl, 2*SIZEOF_MMWORD
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
movq mmF,mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16,7
|
||||
|
||||
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
|
||||
.rgb_gray_cnv:
|
||||
; mmA=(00 10 20 01 11 21 02 12)
|
||||
; mmG=(22 03 13 23 04 14 24 05)
|
||||
; mmF=(15 25 06 16 26 07 17 27)
|
||||
|
||||
movq mmD,mmA
|
||||
psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
|
||||
psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
|
||||
|
||||
punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
|
||||
psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
|
||||
|
||||
punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
|
||||
punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
|
||||
|
||||
movq mmE,mmA
|
||||
psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
|
||||
psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
|
||||
|
||||
punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
|
||||
|
||||
punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
|
||||
punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
|
||||
|
||||
pxor mmH,mmH
|
||||
|
||||
movq mmC,mmA
|
||||
punpcklbw mmA,mmH ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC,mmH ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmB,mmE
|
||||
punpcklbw mmE,mmH ; mmE=(20 22 24 26)
|
||||
punpckhbw mmB,mmH ; mmB=(01 03 05 07)
|
||||
|
||||
movq mmF,mmD
|
||||
punpcklbw mmD,mmH ; mmD=(11 13 15 17)
|
||||
punpckhbw mmF,mmH ; mmF=(21 23 25 27)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_MMWORD/8
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_MMWORD/8
|
||||
movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_MMWORD/4
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
movq mmF,mmA
|
||||
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_MMWORD/2
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
movq mmD,mmA
|
||||
movq mmC,mmF
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16,7
|
||||
|
||||
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
|
||||
|
||||
.rgb_gray_cnv:
|
||||
; mmA=(00 10 20 30 01 11 21 31)
|
||||
; mmF=(02 12 22 32 03 13 23 33)
|
||||
; mmD=(04 14 24 34 05 15 25 35)
|
||||
; mmC=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movq mmB,mmA
|
||||
punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
|
||||
punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
|
||||
|
||||
movq mmG,mmD
|
||||
punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
|
||||
punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
|
||||
|
||||
movq mmE,mmA
|
||||
punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
|
||||
|
||||
movq mmH,mmB
|
||||
punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
|
||||
punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
|
||||
|
||||
pxor mmF,mmF
|
||||
|
||||
movq mmC,mmA
|
||||
punpcklbw mmA,mmF ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC,mmF ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmD,mmB
|
||||
punpcklbw mmB,mmF ; mmB=(01 03 05 07)
|
||||
punpckhbw mmD,mmF ; mmD=(11 13 15 17)
|
||||
|
||||
movq mmG,mmE
|
||||
punpcklbw mmE,mmF ; mmE=(20 22 24 26)
|
||||
punpckhbw mmG,mmF ; mmG=(30 32 34 36)
|
||||
|
||||
punpcklbw mmF,mmH
|
||||
punpckhbw mmH,mmH
|
||||
psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
|
||||
psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
|
||||
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
|
||||
movq mm6,mm1
|
||||
punpcklwd mm1,mm3
|
||||
punpckhwd mm6,mm3
|
||||
pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
movq mm6,mm0
|
||||
punpcklwd mm0,mm2
|
||||
punpckhwd mm6,mm2
|
||||
pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
movq mm0, mm5 ; mm0=BO
|
||||
movq mm6, mm4 ; mm6=BE
|
||||
|
||||
movq mm4,mm0
|
||||
punpcklwd mm0,mm3
|
||||
punpckhwd mm4,mm3
|
||||
pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
|
||||
movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
|
||||
|
||||
paddd mm0, mm1
|
||||
paddd mm4, mm7
|
||||
paddd mm0,mm3
|
||||
paddd mm4,mm3
|
||||
psrld mm0,SCALEBITS ; mm0=YOL
|
||||
psrld mm4,SCALEBITS ; mm4=YOH
|
||||
packssdw mm0,mm4 ; mm0=YO
|
||||
|
||||
movq mm4,mm6
|
||||
punpcklwd mm6,mm2
|
||||
punpckhwd mm4,mm2
|
||||
pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
|
||||
movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
|
||||
|
||||
paddd mm6, MMWORD [wk(0)]
|
||||
paddd mm4, MMWORD [wk(1)]
|
||||
paddd mm6,mm2
|
||||
paddd mm4,mm2
|
||||
psrld mm6,SCALEBITS ; mm6=YEL
|
||||
psrld mm4,SCALEBITS ; mm4=YEH
|
||||
packssdw mm6,mm4 ; mm6=YE
|
||||
|
||||
psllw mm0,BYTE_BIT
|
||||
por mm6,mm0 ; mm6=Y
|
||||
movq MMWORD [edi], mm6 ; Save Y
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte SIZEOF_MMWORD ; outptr0
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae near .columnloop
|
||||
test ecx,ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
poppic eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp,ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 16
|
||||
366
simd/jcgryss2-64.asm
Normal file
366
simd/jcgryss2-64.asm
Normal file
@@ -0,0 +1,366 @@
|
||||
;
|
||||
; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; Copyright (C) 2011, D. R. Commander.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler); it
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
|
||||
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13 = JDIMENSION output_row
|
||||
; r14 = int num_rows
|
||||
|
||||
; wk(i) yields the address expression of the i-th XMMWORD-sized scratch slot.
; The slots sit immediately below the (aligned) frame pointer rbp: wk(0) is
; the lowest one, at rbp - WK_NUM*SIZEOF_XMMWORD.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
; Number of scratch slots addressable through wk(i).
%define WK_NUM 2
|
||||
|
||||
align 16
|
||||
|
||||
global EXTN(jsimd_rgb_gray_convert_sse2)
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_sse2):
    ; ----------------------------------------------------------------------
    ; RGB -> grayscale (luma only) conversion, 64-bit SSE2.
    ;
    ; For each of num_rows rows, reads img_width packed pixels of
    ; RGB_PIXELSIZE (3 or 4) bytes from input_buf[row] and writes one Y byte
    ; per pixel to output_buf[0][output_row + row].  Output is stored one
    ; XMMWORD (16 pixels) at a time with an aligned store, so a final partial
    ; group still writes a full XMMWORD.
    ;
    ; Arguments are expected in r10..r14 after collect_args (macro defined
    ; outside this file section):
    ;   r10 = img_width, r11 = input_buf, r12 = output_buf,
    ;   r13 = output_row, r14 = num_rows
    ;
    ; img_width, output_row and num_rows are 32-bit values.  Only the low
    ; 32 bits of r10/r13/r14 are therefore consumed: the upper halves of
    ; registers carrying 32-bit arguments are not guaranteed to be zero and
    ; must not leak into 64-bit counters or address arithmetic.
    ; ----------------------------------------------------------------------

    ; --- Prologue: build a 16-byte aligned frame with WK_NUM scratch slots
    push    rbp
    mov     rax, rsp                        ; rax = original rbp
    sub     rsp, byte 4
    and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov     [rsp], rax                      ; remember the unaligned stack pointer
    mov     rbp, rsp                        ; rbp = aligned rbp
    lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
    collect_args
    push    rbx

    mov     ecx, r10d                       ; col = img_width (32-bit, zero-extended)
    test    ecx, ecx
    jz      near .return                    ; nothing to do for an empty row

    push    rcx

    mov     rsi, r12                        ; output_buf
    mov     ecx, r13d                       ; output_row (32-bit, zero-extended)
    mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]   ; component 0 row array
    lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]

    pop     rcx                             ; col

    mov     rsi, r11                        ; input_buf
    mov     eax, r14d                       ; num_rows (zero-extended into rax)
    test    eax, eax                        ; sign-test the 32-bit int, not rax
    jle     near .return                    ; num_rows <= 0: nothing to do

.rowloop:
    push    rdi
    push    rsi
    push    rcx                             ; col

    mov     rsi, JSAMPROW [rsi]             ; inptr
    mov     rdi, JSAMPROW [rdi]             ; outptr0

    cmp     rcx, byte SIZEOF_XMMWORD
    jae     near .columnloop                ; at least 16 pixels left: full load

%if RGB_PIXELSIZE == 3 ; ---------------

    ; --- Partial group (< 16 pixels): assemble the remaining col*3 bytes
    ;     piecewise (1, 2, 4, 8, 16, 32 bytes) so that nothing past the end
    ;     of the row is read.
.column_ld1:
    push    rax
    push    rdx
    lea     rcx, [rcx+rcx*2]                ; imul ecx,RGB_PIXELSIZE
    test    cl, SIZEOF_BYTE
    jz      short .column_ld2
    sub     rcx, byte SIZEOF_BYTE
    movzx   rax, BYTE [rsi+rcx]
.column_ld2:
    test    cl, SIZEOF_WORD
    jz      short .column_ld4
    sub     rcx, byte SIZEOF_WORD
    movzx   rdx, WORD [rsi+rcx]
    shl     rax, WORD_BIT
    or      rax, rdx
.column_ld4:
    movd    xmmA, eax
    pop     rdx
    pop     rax
    test    cl, SIZEOF_DWORD
    jz      short .column_ld8
    sub     rcx, byte SIZEOF_DWORD
    movd    xmmF, XMM_DWORD [rsi+rcx]
    pslldq  xmmA, SIZEOF_DWORD
    por     xmmA, xmmF
.column_ld8:
    test    cl, SIZEOF_MMWORD
    jz      short .column_ld16
    sub     rcx, byte SIZEOF_MMWORD
    movq    xmmB, XMM_MMWORD [rsi+rcx]
    pslldq  xmmA, SIZEOF_MMWORD
    por     xmmA, xmmB
.column_ld16:
    test    cl, SIZEOF_XMMWORD
    jz      short .column_ld32
    movdqa  xmmF, xmmA
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov     rcx, SIZEOF_XMMWORD             ; makes the loop tail terminate
    jmp     short .rgb_gray_cnv
.column_ld32:
    test    cl, 2*SIZEOF_XMMWORD
    mov     rcx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
    jz      short .rgb_gray_cnv
    movdqa  xmmB, xmmA
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp     short .rgb_gray_cnv

    ; --- Full group: 16 pixels = 48 bytes
.columnloop:
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

    ; --- De-interleave the packed 3-byte pixels into per-component vectors.
    ;     In the lane maps below "cp" means component c of pixel p (hex).
.rgb_gray_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa    xmmG, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq    xmmG, 8       ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmF    ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq    xmmF, 8       ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw xmmG, xmmB    ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw xmmF, xmmB    ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa    xmmD, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq    xmmD, 8       ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmG    ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq    xmmG, 8       ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw xmmD, xmmF    ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw xmmG, xmmF    ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa    xmmE, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq    xmmE, 8       ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmD    ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq    xmmD, 8       ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw xmmE, xmmG    ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmG    ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor      xmmH, xmmH    ; zero, used to widen bytes to words

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmH    ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmH    ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmB, xmmE
    punpcklbw xmmE, xmmH    ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmB, xmmH    ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa    xmmF, xmmD
    punpcklbw xmmD, xmmH    ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmF, xmmH    ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

    ; --- Partial group (< 16 pixels): load 1, 2, 4 and 8 pixels as needed.
.column_ld1:
    test    cl, SIZEOF_XMMWORD/16
    jz      short .column_ld2
    sub     rcx, byte SIZEOF_XMMWORD/16
    movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test    cl, SIZEOF_XMMWORD/8
    jz      short .column_ld4
    sub     rcx, byte SIZEOF_XMMWORD/8
    movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq  xmmA, SIZEOF_MMWORD
    por     xmmA, xmmE
.column_ld4:
    test    cl, SIZEOF_XMMWORD/4
    jz      short .column_ld8
    sub     rcx, byte SIZEOF_XMMWORD/4
    movdqa  xmmE, xmmA
    movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test    cl, SIZEOF_XMMWORD/2
    mov     rcx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
    jz      short .rgb_gray_cnv
    movdqa  xmmF, xmmA
    movdqa  xmmH, xmmE
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp     short .rgb_gray_cnv

    ; --- Full group: 16 pixels = 64 bytes
.columnloop:
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

    ; --- De-interleave the packed 4-byte pixels into per-component vectors.
.rgb_gray_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa    xmmD, xmmA
    punpcklbw xmmA, xmmE    ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw xmmD, xmmE    ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa    xmmC, xmmF
    punpcklbw xmmF, xmmH    ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw xmmC, xmmH    ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa    xmmB, xmmA
    punpcklwd xmmA, xmmF    ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd xmmB, xmmF    ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa    xmmG, xmmD
    punpcklwd xmmD, xmmC    ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd xmmG, xmmC    ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa    xmmE, xmmA
    punpcklbw xmmA, xmmD    ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw xmmE, xmmD    ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa    xmmH, xmmB
    punpcklbw xmmB, xmmG    ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmH, xmmG    ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor      xmmF, xmmF    ; zero, used to widen bytes to words

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmF    ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmF    ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmD, xmmB
    punpcklbw xmmB, xmmF    ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmF    ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa    xmmG, xmmE
    punpcklbw xmmE, xmmF    ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmG, xmmF    ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF is the zero register here, so the bytes of xmmH land in the high
    ; half of each word and are shifted back down afterwards.
    punpcklbw xmmF, xmmH
    punpckhbw xmmH, xmmH
    psrlw     xmmF, BYTE_BIT    ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw     xmmH, BYTE_BIT    ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; The G weight is split in two so that each pmaddwd handles one
    ; (R,G) or (B,G) word pair.

    ; --- Odd pixels: R*0.299 + G*0.337
    movdqa    xmm6, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm6, xmm3
    pmaddwd   xmm1, [rel PW_F0299_F0337]    ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd   xmm6, [rel PW_F0299_F0337]    ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa    xmm7, xmm6                    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; --- Even pixels: R*0.299 + G*0.337 (parked in the scratch slots)
    movdqa    xmm6, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm6, xmm2
    pmaddwd   xmm0, [rel PW_F0299_F0337]    ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd   xmm6, [rel PW_F0299_F0337]    ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa    XMMWORD [wk(0)], xmm0         ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa    XMMWORD [wk(1)], xmm6         ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa    xmm0, xmm5                    ; xmm0=BO
    movdqa    xmm6, xmm4                    ; xmm6=BE

    ; --- Odd pixels: B*0.114 + G*0.250, then sum, round and scale
    movdqa    xmm4, xmm0
    punpcklwd xmm0, xmm3
    punpckhwd xmm4, xmm3
    pmaddwd   xmm0, [rel PW_F0114_F0250]    ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd   xmm4, [rel PW_F0114_F0250]    ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa    xmm3, [rel PD_ONEHALF]        ; xmm3=[PD_ONEHALF]

    paddd     xmm0, xmm1
    paddd     xmm4, xmm7
    paddd     xmm0, xmm3
    paddd     xmm4, xmm3
    psrld     xmm0, SCALEBITS               ; xmm0=YOL
    psrld     xmm4, SCALEBITS               ; xmm4=YOH
    packssdw  xmm0, xmm4                    ; xmm0=YO

    ; --- Even pixels: B*0.114 + G*0.250, then sum, round and scale
    movdqa    xmm4, xmm6
    punpcklwd xmm6, xmm2
    punpckhwd xmm4, xmm2
    pmaddwd   xmm6, [rel PW_F0114_F0250]    ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd   xmm4, [rel PW_F0114_F0250]    ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa    xmm2, [rel PD_ONEHALF]        ; xmm2=[PD_ONEHALF]

    paddd     xmm6, XMMWORD [wk(0)]
    paddd     xmm4, XMMWORD [wk(1)]
    paddd     xmm6, xmm2
    paddd     xmm4, xmm2
    psrld     xmm6, SCALEBITS               ; xmm6=YEL
    psrld     xmm4, SCALEBITS               ; xmm4=YEH
    packssdw  xmm6, xmm4                    ; xmm6=YE

    ; --- Re-interleave: even Y in the low byte, odd Y in the high byte
    psllw     xmm0, BYTE_BIT
    por       xmm6, xmm0                    ; xmm6=Y
    movdqa    XMMWORD [rdi], xmm6           ; Save Y

    ; --- Advance to the next group of 16 pixels
    sub     rcx, byte SIZEOF_XMMWORD
    add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add     rdi, byte SIZEOF_XMMWORD                ; outptr0
    cmp     rcx, byte SIZEOF_XMMWORD
    jae     near .columnloop
    test    rcx, rcx
    jnz     near .column_ld1                ; 1..15 pixels left: partial load

    ; --- Advance to the next row
    pop     rcx                             ; col
    pop     rsi
    pop     rdi

    add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
    add     rdi, byte SIZEOF_JSAMPROW
    dec     rax                             ; num_rows
    jg      near .rowloop

.return:
    pop     rbx
    uncollect_args
    mov     rsp, rbp                        ; rsp <- aligned rbp
    pop     rsp                             ; rsp <- original rbp
    pop     rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 16
|
||||
385
simd/jcgryss2.asm
Normal file
385
simd/jcgryss2.asm
Normal file
@@ -0,0 +1,385 @@
|
||||
;
|
||||
; jcgryss2.asm - grayscale colorspace conversion (SSE2)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; Copyright (C) 2011, D. R. Commander.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler); it
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
|
||||
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
; Stack-argument accessors.  (b) is the value of esp right after the
; function's initial "push ebp" (the function keeps it in eax), so [b] is the
; saved ebp, [b+4] the return address and the arguments start at [b+8].
%define img_width(b) (b)+8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b)+20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b)+24 ; int num_rows
|
||||
|
||||
; Frame layout relative to the aligned ebp: [ebp+0] holds the pre-alignment
; stack pointer, the WK_NUM XMMWORD scratch slots lie directly below ebp, and
; the saved GOT pointer lies directly below wk(0).
%define original_ebp ebp+0
|
||||
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 16
|
||||
|
||||
global EXTN(jsimd_rgb_gray_convert_sse2)
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_sse2):
    ; ----------------------------------------------------------------------
    ; RGB -> grayscale (luma only) conversion, 32-bit SSE2.
    ;
    ; For each of num_rows rows, reads img_width packed pixels of
    ; RGB_PIXELSIZE (3 or 4) bytes from input_buf[row] and writes one Y byte
    ; per pixel to output_buf[0][output_row + row], 16 pixels per store.
    ;
    ; pushpic/poppic/get_GOT/movpic/GOTOFF/alignx are helper macros defined
    ; outside this file section.
    ; ----------------------------------------------------------------------

    ; --- Prologue: aligned frame, scratch slots, GOT pointer slot
    push    ebp
    mov     eax, esp                        ; eax = original ebp
    sub     esp, byte 4
    and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov     [esp], eax                      ; remember the unaligned stack pointer
    mov     ebp, esp                        ; ebp = aligned ebp
    lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
    pushpic eax                             ; make a room for GOT address
    push    ebx
    ; ecx and edx are not saved: they need not be preserved
    push    esi
    push    edi

    get_GOT ebx                             ; get GOT address
    movpic  POINTER [gotptr], ebx           ; save GOT address

    mov     ecx, JDIMENSION [img_width(eax)]
    test    ecx, ecx
    jz      near .return                    ; nothing to do for an empty row

    push    ecx

    mov     esi, JSAMPIMAGE [output_buf(eax)]
    mov     ecx, JDIMENSION [output_row(eax)]
    mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]   ; component 0 row array
    lea     edi, [edi+ecx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]

    pop     ecx                             ; col

    mov     esi, JSAMPARRAY [input_buf(eax)]
    mov     eax, INT [num_rows(eax)]
    test    eax, eax
    jle     near .return                    ; num_rows <= 0: nothing to do
    alignx  16,7
.rowloop:
    pushpic eax
    push    edi
    push    esi
    push    ecx                             ; col

    mov     esi, JSAMPROW [esi]             ; inptr
    mov     edi, JSAMPROW [edi]             ; outptr0
    movpic  eax, POINTER [gotptr]           ; load GOT address (eax)

    cmp     ecx, byte SIZEOF_XMMWORD
    jae     near .columnloop                ; at least 16 pixels left: full load
    alignx  16,7

%if RGB_PIXELSIZE == 3 ; ---------------

    ; --- Partial group (< 16 pixels): assemble the remaining col*3 bytes
    ;     piecewise (1, 2, 4, 8, 16, 32 bytes).
.column_ld1:
    push    eax
    push    edx
    lea     ecx, [ecx+ecx*2]                ; imul ecx,RGB_PIXELSIZE
    test    cl, SIZEOF_BYTE
    jz      short .column_ld2
    sub     ecx, byte SIZEOF_BYTE
    movzx   eax, BYTE [esi+ecx]
.column_ld2:
    test    cl, SIZEOF_WORD
    jz      short .column_ld4
    sub     ecx, byte SIZEOF_WORD
    movzx   edx, WORD [esi+ecx]
    shl     eax, WORD_BIT
    or      eax, edx
.column_ld4:
    movd    xmmA, eax
    pop     edx
    pop     eax
    test    cl, SIZEOF_DWORD
    jz      short .column_ld8
    sub     ecx, byte SIZEOF_DWORD
    movd    xmmF, XMM_DWORD [esi+ecx]
    pslldq  xmmA, SIZEOF_DWORD
    por     xmmA, xmmF
.column_ld8:
    test    cl, SIZEOF_MMWORD
    jz      short .column_ld16
    sub     ecx, byte SIZEOF_MMWORD
    movq    xmmB, XMM_MMWORD [esi+ecx]
    pslldq  xmmA, SIZEOF_MMWORD
    por     xmmA, xmmB
.column_ld16:
    test    cl, SIZEOF_XMMWORD
    jz      short .column_ld32
    movdqa  xmmF, xmmA
    movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    mov     ecx, SIZEOF_XMMWORD             ; makes the loop tail terminate
    jmp     short .rgb_gray_cnv
.column_ld32:
    test    cl, 2*SIZEOF_XMMWORD
    mov     ecx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
    jz      short .rgb_gray_cnv
    movdqa  xmmB, xmmA
    movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
    jmp     short .rgb_gray_cnv
    alignx  16,7

    ; --- Full group: 16 pixels = 48 bytes
.columnloop:
    movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
    movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]

    ; --- De-interleave the packed 3-byte pixels into per-component vectors.
    ;     In the lane maps below "cp" means component c of pixel p (hex).
.rgb_gray_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa    xmmG, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq    xmmG, 8       ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmF    ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq    xmmF, 8       ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw xmmG, xmmB    ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw xmmF, xmmB    ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa    xmmD, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq    xmmD, 8       ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmG    ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq    xmmG, 8       ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw xmmD, xmmF    ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw xmmG, xmmF    ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa    xmmE, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq    xmmE, 8       ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmD    ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq    xmmD, 8       ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw xmmE, xmmG    ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmG    ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor      xmmH, xmmH    ; zero, used to widen bytes to words

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmH    ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmH    ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmB, xmmE
    punpcklbw xmmE, xmmH    ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmB, xmmH    ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa    xmmF, xmmD
    punpcklbw xmmD, xmmH    ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmF, xmmH    ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

    ; --- Partial group (< 16 pixels): load 1, 2, 4 and 8 pixels as needed.
.column_ld1:
    test    cl, SIZEOF_XMMWORD/16
    jz      short .column_ld2
    sub     ecx, byte SIZEOF_XMMWORD/16
    movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
    test    cl, SIZEOF_XMMWORD/8
    jz      short .column_ld4
    sub     ecx, byte SIZEOF_XMMWORD/8
    movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
    pslldq  xmmA, SIZEOF_MMWORD
    por     xmmA, xmmE
.column_ld4:
    test    cl, SIZEOF_XMMWORD/4
    jz      short .column_ld8
    sub     ecx, byte SIZEOF_XMMWORD/4
    movdqa  xmmE, xmmA
    movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
    test    cl, SIZEOF_XMMWORD/2
    mov     ecx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
    jz      short .rgb_gray_cnv
    movdqa  xmmF, xmmA
    movdqa  xmmH, xmmE
    movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
    jmp     short .rgb_gray_cnv
    alignx  16,7

    ; --- Full group: 16 pixels = 64 bytes
.columnloop:
    movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
    movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]

    ; --- De-interleave the packed 4-byte pixels into per-component vectors.
.rgb_gray_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa    xmmD, xmmA
    punpcklbw xmmA, xmmE    ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw xmmD, xmmE    ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa    xmmC, xmmF
    punpcklbw xmmF, xmmH    ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw xmmC, xmmH    ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa    xmmB, xmmA
    punpcklwd xmmA, xmmF    ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd xmmB, xmmF    ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa    xmmG, xmmD
    punpcklwd xmmD, xmmC    ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd xmmG, xmmC    ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa    xmmE, xmmA
    punpcklbw xmmA, xmmD    ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw xmmE, xmmD    ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa    xmmH, xmmB
    punpcklbw xmmB, xmmG    ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmH, xmmG    ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor      xmmF, xmmF    ; zero, used to widen bytes to words

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmF    ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmF    ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmD, xmmB
    punpcklbw xmmB, xmmF    ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmF    ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa    xmmG, xmmE
    punpcklbw xmmE, xmmF    ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmG, xmmF    ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF is the zero register here, so the bytes of xmmH land in the high
    ; half of each word and are shifted back down afterwards.
    punpcklbw xmmF, xmmH
    punpckhbw xmmH, xmmH
    psrlw     xmmF, BYTE_BIT    ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw     xmmH, BYTE_BIT    ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; The G weight is split in two so that each pmaddwd handles one
    ; (R,G) or (B,G) word pair.

    ; --- Odd pixels: R*0.299 + G*0.337
    movdqa    xmm6, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm6, xmm3
    pmaddwd   xmm1, [GOTOFF(eax,PW_F0299_F0337)]    ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd   xmm6, [GOTOFF(eax,PW_F0299_F0337)]    ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa    xmm7, xmm6                    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; --- Even pixels: R*0.299 + G*0.337 (parked in the scratch slots)
    movdqa    xmm6, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm6, xmm2
    pmaddwd   xmm0, [GOTOFF(eax,PW_F0299_F0337)]    ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd   xmm6, [GOTOFF(eax,PW_F0299_F0337)]    ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa    XMMWORD [wk(0)], xmm0         ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa    XMMWORD [wk(1)], xmm6         ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa    xmm0, xmm5                    ; xmm0=BO
    movdqa    xmm6, xmm4                    ; xmm6=BE

    ; --- Odd pixels: B*0.114 + G*0.250, then sum, round and scale
    movdqa    xmm4, xmm0
    punpcklwd xmm0, xmm3
    punpckhwd xmm4, xmm3
    pmaddwd   xmm0, [GOTOFF(eax,PW_F0114_F0250)]    ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd   xmm4, [GOTOFF(eax,PW_F0114_F0250)]    ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa    xmm3, [GOTOFF(eax,PD_ONEHALF)]        ; xmm3=[PD_ONEHALF]

    paddd     xmm0, xmm1
    paddd     xmm4, xmm7
    paddd     xmm0, xmm3
    paddd     xmm4, xmm3
    psrld     xmm0, SCALEBITS               ; xmm0=YOL
    psrld     xmm4, SCALEBITS               ; xmm4=YOH
    packssdw  xmm0, xmm4                    ; xmm0=YO

    ; --- Even pixels: B*0.114 + G*0.250, then sum, round and scale
    movdqa    xmm4, xmm6
    punpcklwd xmm6, xmm2
    punpckhwd xmm4, xmm2
    pmaddwd   xmm6, [GOTOFF(eax,PW_F0114_F0250)]    ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd   xmm4, [GOTOFF(eax,PW_F0114_F0250)]    ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa    xmm2, [GOTOFF(eax,PD_ONEHALF)]        ; xmm2=[PD_ONEHALF]

    paddd     xmm6, XMMWORD [wk(0)]
    paddd     xmm4, XMMWORD [wk(1)]
    paddd     xmm6, xmm2
    paddd     xmm4, xmm2
    psrld     xmm6, SCALEBITS               ; xmm6=YEL
    psrld     xmm4, SCALEBITS               ; xmm4=YEH
    packssdw  xmm6, xmm4                    ; xmm6=YE

    ; --- Re-interleave: even Y in the low byte, odd Y in the high byte
    psllw     xmm0, BYTE_BIT
    por       xmm6, xmm0                    ; xmm6=Y
    movdqa    XMMWORD [edi], xmm6           ; Save Y

    ; --- Advance to the next group of 16 pixels
    sub     ecx, byte SIZEOF_XMMWORD
    add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add     edi, byte SIZEOF_XMMWORD                ; outptr0
    cmp     ecx, byte SIZEOF_XMMWORD
    jae     near .columnloop
    test    ecx, ecx
    jnz     near .column_ld1                ; 1..15 pixels left: partial load

    ; --- Advance to the next row
    pop     ecx                             ; col
    pop     esi
    pop     edi
    poppic  eax

    add     esi, byte SIZEOF_JSAMPROW       ; input_buf
    add     edi, byte SIZEOF_JSAMPROW
    dec     eax                             ; num_rows
    jg      near .rowloop

.return:
    pop     edi
    pop     esi
    ; edx and ecx were never pushed: they need not be preserved
    pop     ebx
    mov     esp, ebp                        ; esp <- aligned ebp
    pop     esp                             ; esp <- original ebp
    pop     ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 16
|
||||
67
simd/jsimd.h
67
simd/jsimd.h
@@ -43,6 +43,14 @@
|
||||
#define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2
|
||||
#define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2
|
||||
#define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2
|
||||
/*
 * Short aliases for the SSE2 RGB-to-grayscale constant table and converters.
 * NOTE(review): presumably these serve the same short-external-name purpose
 * as the neighbouring *_ycc_convert_sse2 aliases -- confirm against the
 * enclosing #ifdef, which is outside this section.
 */
#define jconst_rgb_gray_convert_sse2     jSCRGBGRYS2
#define jsimd_rgb_gray_convert_sse2      jSRGBGRYS2
#define jsimd_extrgb_gray_convert_sse2   jSEXTRGBGRYS2
#define jsimd_extrgbx_gray_convert_sse2  jSEXTRGBXGRYS2
#define jsimd_extbgr_gray_convert_sse2   jSEXTBGRGRYS2
#define jsimd_extbgrx_gray_convert_sse2  jSEXTBGRXGRYS2
#define jsimd_extxbgr_gray_convert_sse2  jSEXTXBGRGRYS2
#define jsimd_extxrgb_gray_convert_sse2  jSEXTXRGBGRYS2
|
||||
#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
|
||||
#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
|
||||
#define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2
|
||||
@@ -163,6 +171,35 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
|
||||
EXTERN(void) jsimd_rgb_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extrgb_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extrgbx_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extbgr_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extbgrx_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extxbgr_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extxrgb_gray_convert_mmx
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_mmx
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
@@ -222,6 +259,36 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
|
||||
extern const int jconst_rgb_gray_convert_sse2[];
|
||||
EXTERN(void) jsimd_rgb_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extrgb_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extrgbx_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extbgr_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extbgrx_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extxbgr_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
EXTERN(void) jsimd_extxrgb_gray_convert_sse2
|
||||
JPP((JDIMENSION img_width,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows));
|
||||
|
||||
extern const int jconst_ycc_rgb_convert_sse2[];
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_sse2
|
||||
JPP((JDIMENSION out_width,
|
||||
|
||||
@@ -83,6 +83,28 @@ jsimd_can_rgb_ycc (void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_gray (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
|
||||
return 1;
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_ycc_rgb (void)
|
||||
{
|
||||
@@ -154,6 +176,55 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
||||
output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
|
||||
switch(cinfo->in_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
sse2fct=jsimd_extrgb_gray_convert_sse2;
|
||||
mmxfct=jsimd_extrgb_gray_convert_mmx;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
sse2fct=jsimd_extrgbx_gray_convert_sse2;
|
||||
mmxfct=jsimd_extrgbx_gray_convert_mmx;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
sse2fct=jsimd_extbgr_gray_convert_sse2;
|
||||
mmxfct=jsimd_extbgr_gray_convert_mmx;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
sse2fct=jsimd_extbgrx_gray_convert_sse2;
|
||||
mmxfct=jsimd_extbgrx_gray_convert_mmx;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
sse2fct=jsimd_extxbgr_gray_convert_sse2;
|
||||
mmxfct=jsimd_extxbgr_gray_convert_mmx;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
sse2fct=jsimd_extxrgb_gray_convert_sse2;
|
||||
mmxfct=jsimd_extxrgb_gray_convert_mmx;
|
||||
break;
|
||||
default:
|
||||
sse2fct=jsimd_rgb_gray_convert_sse2;
|
||||
mmxfct=jsimd_rgb_gray_convert_mmx;
|
||||
break;
|
||||
}
|
||||
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
|
||||
sse2fct(cinfo->image_width, input_buf,
|
||||
output_buf, output_row, num_rows);
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
mmxfct(cinfo->image_width, input_buf,
|
||||
output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
|
||||
@@ -46,6 +46,23 @@ jsimd_can_rgb_ycc (void)
|
||||
return 1;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_gray (void)
|
||||
{
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_ycc_rgb (void)
|
||||
{
|
||||
@@ -98,6 +115,41 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
||||
sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
|
||||
switch(cinfo->in_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
sse2fct=jsimd_extrgb_gray_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
sse2fct=jsimd_extrgbx_gray_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
sse2fct=jsimd_extbgr_gray_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
sse2fct=jsimd_extbgrx_gray_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
sse2fct=jsimd_extxbgr_gray_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
sse2fct=jsimd_extxrgb_gray_convert_sse2;
|
||||
break;
|
||||
default:
|
||||
sse2fct=jsimd_rgb_gray_convert_sse2;
|
||||
break;
|
||||
}
|
||||
|
||||
sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
|
||||
Reference in New Issue
Block a user