SIMD-accelerated RGB-to-Grayscale color conversion

This commit is contained in:
DRC
2011-02-18 11:23:45 +00:00
parent a683760d1c
commit 439527e0b9
14 changed files with 1676 additions and 92 deletions

View File

@@ -148,6 +148,8 @@ if WITH_SIMD
else
cmp $(srcdir)/testimgflt-nosimd.jpg testoutflt.jpg
endif
./cjpeg -dct int -grayscale -outfile testoutgray.jpg $(srcdir)/testorig.ppm
cmp $(srcdir)/testimggray.jpg testoutgray.jpg
./djpeg -dct int -fast -ppm -outfile testoutint.ppm $(srcdir)/testorig.jpg
cmp $(srcdir)/testimgint.ppm testoutint.ppm
./djpeg -dct fast -ppm -outfile testoutfst.ppm $(srcdir)/testorig.jpg

104
jccolor.c
View File

@@ -81,74 +81,6 @@ typedef my_color_converter * my_cconvert_ptr;
#define TABLE_SIZE (8*(MAXJSAMPLE+1))
#if BITS_IN_JSAMPLE == 8
static const unsigned char red_lut[256] = {
0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
};
static const unsigned char green_lut[256] = {
0 , 1 , 1 , 2 , 2 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
7 , 8 , 8 , 9 , 9 , 10 , 11 , 11 , 12 , 12 , 13 , 14 ,
14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 ,
21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 ,
28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 ,
35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 ,
42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 ,
49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 ,
56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 ,
63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 ,
70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 ,
77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 ,
85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 ,
92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 ,
99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
34, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
148, 149, 149, 150
};
static const unsigned char blue_lut[256] = {
0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
};
#endif
/*
* Initialize for RGB->YCC colorspace conversion.
*/
@@ -259,36 +191,26 @@ rgb_gray_convert (j_compress_ptr cinfo,
JDIMENSION output_row, int num_rows)
{
my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
#if BITS_IN_JSAMPLE != 8
register int r, g, b;
register INT32 * ctab = cconvert->rgb_ycc_tab;
#endif
register JSAMPROW inptr;
register JSAMPROW outptr;
JSAMPLE *maxoutptr;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->image_width;
int rindex = rgb_red[cinfo->in_color_space];
int gindex = rgb_green[cinfo->in_color_space];
int bindex = rgb_blue[cinfo->in_color_space];
int rgbstride = rgb_pixelsize[cinfo->in_color_space];
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr = output_buf[0][output_row];
maxoutptr = &outptr[num_cols];
output_row++;
for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
for (col = 0; col < num_cols; col++) {
r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
inptr += rgb_pixelsize[cinfo->in_color_space];
/* Y */
#if BITS_IN_JSAMPLE == 8
*outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
+ blue_lut[inptr[bindex]];
#else
*outptr = (JSAMPLE)
((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
+ ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
+ ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
>> SCALEBITS);
#endif
outptr[col] = (JSAMPLE)
((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
>> SCALEBITS);
}
}
}
@@ -490,8 +412,12 @@ jinit_color_converter (j_compress_ptr cinfo)
cinfo->in_color_space == JCS_EXT_BGRX ||
cinfo->in_color_space == JCS_EXT_XBGR ||
cinfo->in_color_space == JCS_EXT_XRGB) {
cconvert->pub.start_pass = rgb_ycc_start;
cconvert->pub.color_convert = rgb_gray_convert;
if (jsimd_can_rgb_gray())
cconvert->pub.color_convert = jsimd_rgb_gray_convert;
else {
cconvert->pub.start_pass = rgb_ycc_start;
cconvert->pub.color_convert = rgb_gray_convert;
}
} else if (cinfo->in_color_space == JCS_YCbCr)
cconvert->pub.color_convert = grayscale_convert;
else

View File

@@ -13,8 +13,10 @@
#ifdef NEED_SHORT_EXTERNAL_NAMES
#define jsimd_can_rgb_ycc jSCanRgbYcc
#define jsimd_can_rgb_gray jSCanRgbGry
#define jsimd_can_ycc_rgb jSCanYccRgb
#define jsimd_rgb_ycc_convert jSRgbYccConv
#define jsimd_rgb_gray_convert jSRgbGryConv
#define jsimd_ycc_rgb_convert jSYccRgbConv
#define jsimd_can_h2v2_downsample jSCanH2V2Down
#define jsimd_can_h2v1_downsample jSCanH2V1Down
@@ -35,12 +37,17 @@
#endif /* NEED_SHORT_EXTERNAL_NAMES */
EXTERN(int) jsimd_can_rgb_ycc JPP((void));
EXTERN(int) jsimd_can_rgb_gray JPP((void));
EXTERN(int) jsimd_can_ycc_rgb JPP((void));
EXTERN(void) jsimd_rgb_ycc_convert
JPP((j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_rgb_gray_convert
JPP((j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_ycc_rgb_convert
JPP((j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,

View File

@@ -24,6 +24,12 @@ jsimd_can_rgb_ycc (void)
return 0;
}
GLOBAL(int)
jsimd_can_rgb_gray (void)
{
return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
@@ -37,6 +43,13 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
{
}
GLOBAL(void)
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
}
GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,

View File

@@ -11,7 +11,7 @@ if SIMD_X86_64
libsimd_la_SOURCES = jsimd_x86_64.c \
jsimd.h jsimdcfg.inc.h \
jsimdext.inc jcolsamp.inc jdct.inc \
jfsseflt-64.asm \
jfsseflt-64.asm jcgrass2-64.asm \
jccolss2-64.asm jdcolss2-64.asm \
jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
@@ -20,6 +20,7 @@ libsimd_la_SOURCES = jsimd_x86_64.c \
jccolss2-64.lo: jcclrss2-64.asm
jdcolss2-64.lo: jdclrss2-64.asm
jcgrass2-64.lo: jcgryss2-64.asm
jdmerss2-64.lo: jdmrgss2-64.asm
endif
@@ -29,20 +30,22 @@ libsimd_la_SOURCES = jsimd_i386.c \
jsimd.h jsimdcfg.inc.h \
jsimdext.inc jcolsamp.inc jdct.inc \
jsimdcpu.asm \
jccolmmx.asm jdcolmmx.asm \
jccolmmx.asm jdcolmmx.asm jcgrammx.asm \
jcsammmx.asm jdsammmx.asm jdmermmx.asm \
jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \
jimmxred.asm jimmxint.asm jimmxfst.asm \
jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \
jcqntsse.asm jfsseflt.asm jisseflt.asm \
jccolss2.asm jdcolss2.asm \
jccolss2.asm jdcolss2.asm jcgrass2.asm \
jcsamss2.asm jdsamss2.asm jdmerss2.asm \
jcqnts2i.asm jfss2fst.asm jfss2int.asm \
jiss2red.asm jiss2int.asm jiss2fst.asm \
jcqnts2f.asm jiss2flt.asm
jccolmmx.lo: jcclrmmx.asm
jcgrammx.lo: jcgrymmx.asm
jccolss2.lo: jcclrss2.asm
jcgrass2.lo: jcgryss2.asm
jdcolmmx.lo: jdclrmmx.asm
jdcolss2.lo: jdclrss2.asm
jdmermmx.lo: jdmrgmmx.asm

113
simd/jcgrammx.asm Normal file
View File

@@ -0,0 +1,113 @@
;
; jcgrammx.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2011 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_gray_convert_mmx)
EXTN(jconst_rgb_gray_convert_mmx):
PW_F0299_F0337 times 2 dw F_0_299, F_0_337
PW_F0114_F0250 times 2 dw F_0_114, F_0_250
PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
%include "jcgrymmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 0
%define RGB_GREEN 1
%define RGB_BLUE 2
%define RGB_PIXELSIZE 3
%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
%include "jcgrymmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 0
%define RGB_GREEN 1
%define RGB_BLUE 2
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
%include "jcgrymmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 2
%define RGB_GREEN 1
%define RGB_BLUE 0
%define RGB_PIXELSIZE 3
%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
%include "jcgrymmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 2
%define RGB_GREEN 1
%define RGB_BLUE 0
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
%include "jcgrymmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 3
%define RGB_GREEN 2
%define RGB_BLUE 1
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
%include "jcgrymmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 1
%define RGB_GREEN 2
%define RGB_BLUE 3
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
%include "jcgrymmx.asm"

110
simd/jcgrass2-64.asm Normal file
View File

@@ -0,0 +1,110 @@
;
; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
%include "jcgryss2-64.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 0
%define RGB_GREEN 1
%define RGB_BLUE 2
%define RGB_PIXELSIZE 3
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
%include "jcgryss2-64.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 0
%define RGB_GREEN 1
%define RGB_BLUE 2
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
%include "jcgryss2-64.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 2
%define RGB_GREEN 1
%define RGB_BLUE 0
%define RGB_PIXELSIZE 3
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
%include "jcgryss2-64.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 2
%define RGB_GREEN 1
%define RGB_BLUE 0
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
%include "jcgryss2-64.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 3
%define RGB_GREEN 2
%define RGB_BLUE 1
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
%include "jcgryss2-64.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 1
%define RGB_GREEN 2
%define RGB_BLUE 3
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
%include "jcgryss2-64.asm"

110
simd/jcgrass2.asm Normal file
View File

@@ -0,0 +1,110 @@
;
; jcgrass2.asm - grayscale colorspace conversion (SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 16
global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
%include "jcgryss2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 0
%define RGB_GREEN 1
%define RGB_BLUE 2
%define RGB_PIXELSIZE 3
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
%include "jcgryss2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 0
%define RGB_GREEN 1
%define RGB_BLUE 2
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
%include "jcgryss2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 2
%define RGB_GREEN 1
%define RGB_BLUE 0
%define RGB_PIXELSIZE 3
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
%include "jcgryss2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 2
%define RGB_GREEN 1
%define RGB_BLUE 0
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
%include "jcgryss2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 3
%define RGB_GREEN 2
%define RGB_BLUE 1
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
%include "jcgryss2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED 1
%define RGB_GREEN 2
%define RGB_BLUE 3
%define RGB_PIXELSIZE 4
%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
%include "jcgryss2.asm"

359
simd/jcgrymmx.asm Normal file
View File

@@ -0,0 +1,359 @@
;
; jcclrmmx.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2011 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
; JDIMENSION output_row, int num_rows);
;
%define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16
global EXTN(jsimd_rgb_gray_convert_mmx)
EXTN(jsimd_rgb_gray_convert_mmx):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make a room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
test ecx,ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax,eax
jle near .return
alignx 16,7
.rowloop:
pushpic eax
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_MMWORD
jae short .columnloop
alignx 16,7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
xor eax,eax
mov al, BYTE [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
xor edx,edx
mov dx, WORD [esi+ecx]
shl eax, WORD_BIT
or eax,edx
.column_ld4:
movd mmA,eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd mmG, DWORD [esi+ecx]
psllq mmA, DWORD_BIT
por mmA,mmG
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
movq mmG,mmA
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
mov ecx, SIZEOF_MMWORD
jmp short .rgb_gray_cnv
.column_ld16:
test cl, 2*SIZEOF_MMWORD
mov ecx, SIZEOF_MMWORD
jz short .rgb_gray_cnv
movq mmF,mmA
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_gray_cnv
alignx 16,7
.columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
.rgb_gray_cnv:
; mmA=(00 10 20 01 11 21 02 12)
; mmG=(22 03 13 23 04 14 24 05)
; mmF=(15 25 06 16 26 07 17 27)
movq mmD,mmA
psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
movq mmE,mmA
psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
pxor mmH,mmH
movq mmC,mmA
punpcklbw mmA,mmH ; mmA=(00 02 04 06)
punpckhbw mmC,mmH ; mmC=(10 12 14 16)
movq mmB,mmE
punpcklbw mmE,mmH ; mmE=(20 22 24 26)
punpckhbw mmB,mmH ; mmB=(01 03 05 07)
movq mmF,mmD
punpcklbw mmD,mmH ; mmD=(11 13 15 17)
punpckhbw mmF,mmH ; mmF=(21 23 25 27)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_MMWORD/8
jz short .column_ld2
sub ecx, byte SIZEOF_MMWORD/8
movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_MMWORD/4
jz short .column_ld4
sub ecx, byte SIZEOF_MMWORD/4
movq mmF,mmA
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld4:
test cl, SIZEOF_MMWORD/2
mov ecx, SIZEOF_MMWORD
jz short .rgb_gray_cnv
movq mmD,mmA
movq mmC,mmF
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_gray_cnv
alignx 16,7
.columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
.rgb_gray_cnv:
; mmA=(00 10 20 30 01 11 21 31)
; mmF=(02 12 22 32 03 13 23 33)
; mmD=(04 14 24 34 05 15 25 35)
; mmC=(06 16 26 36 07 17 27 37)
movq mmB,mmA
punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
movq mmG,mmD
punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
movq mmE,mmA
punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
movq mmH,mmB
punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
pxor mmF,mmF
movq mmC,mmA
punpcklbw mmA,mmF ; mmA=(00 02 04 06)
punpckhbw mmC,mmF ; mmC=(10 12 14 16)
movq mmD,mmB
punpcklbw mmB,mmF ; mmB=(01 03 05 07)
punpckhbw mmD,mmF ; mmD=(11 13 15 17)
movq mmG,mmE
punpcklbw mmE,mmF ; mmE=(20 22 24 26)
punpckhbw mmG,mmF ; mmG=(30 32 34 36)
punpcklbw mmF,mmH
punpckhbw mmH,mmH
psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
%endif ; RGB_PIXELSIZE ; ---------------
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movq mm6,mm1
punpcklwd mm1,mm3
punpckhwd mm6,mm3
pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movq mm6,mm0
punpcklwd mm0,mm2
punpckhwd mm6,mm2
pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movq mm0, mm5 ; mm0=BO
movq mm6, mm4 ; mm6=BE
movq mm4,mm0
punpcklwd mm0,mm3
punpckhwd mm4,mm3
pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
paddd mm0, mm1
paddd mm4, mm7
paddd mm0,mm3
paddd mm4,mm3
psrld mm0,SCALEBITS ; mm0=YOL
psrld mm4,SCALEBITS ; mm4=YOH
packssdw mm0,mm4 ; mm0=YO
movq mm4,mm6
punpcklwd mm6,mm2
punpckhwd mm4,mm2
pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
paddd mm6, MMWORD [wk(0)]
paddd mm4, MMWORD [wk(1)]
paddd mm6,mm2
paddd mm4,mm2
psrld mm6,SCALEBITS ; mm6=YEL
psrld mm4,SCALEBITS ; mm4=YEH
packssdw mm6,mm4 ; mm6=YE
psllw mm0,BYTE_BIT
por mm6,mm0 ; mm6=Y
movq MMWORD [edi], mm6 ; Save Y
sub ecx, byte SIZEOF_MMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
add edi, byte SIZEOF_MMWORD ; outptr0
cmp ecx, byte SIZEOF_MMWORD
jae near .columnloop
test ecx,ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16

366
simd/jcgryss2-64.asm Normal file
View File

@@ -0,0 +1,366 @@
;
; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
; JDIMENSION output_row, int num_rows);
;
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 16
global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
push rbp
mov rax,rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp],rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
collect_args
push rbx
mov rcx, r10
test rcx,rcx
jz near .return
push rcx
mov rsi, r12
mov rcx, r13
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
pop rcx
mov rsi, r11
mov eax, r14d
test rax,rax
jle near .return
.rowloop:
push rdi
push rsi
push rcx ; col
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr0
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push rax
push rdx
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
movzx rax, BYTE [rsi+rcx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
movzx rdx, WORD [rsi+rcx]
shl rax, WORD_BIT
or rax,rdx
.column_ld4:
movd xmmA,eax
pop rdx
pop rax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub rcx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [rsi+rcx]
pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub rcx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [rsi+rcx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF,xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
mov rcx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
.column_ld32:
test cl, 2*SIZEOF_XMMWORD
mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB,xmmA
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
.columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH
movdqa xmmC,xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub rcx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub rcx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub rcx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov rcx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF,xmmA
movdqa xmmH,xmmE
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
.columnloop:
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF
movdqa xmmC,xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH
punpckhbw xmmH,xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1
punpcklwd xmm1,xmm3
punpckhwd xmm6,xmm3
pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0
punpcklwd xmm0,xmm2
punpckhwd xmm6,xmm2
pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0
punpcklwd xmm0,xmm3
punpckhwd xmm4,xmm3
pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0,xmm3
paddd xmm4,xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO
movdqa xmm4,xmm6
punpcklwd xmm6,xmm2
punpckhwd xmm4,xmm2
pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2
paddd xmm4,xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT
por xmm6,xmm0 ; xmm6=Y
movdqa XMMWORD [rdi], xmm6 ; Save Y
sub rcx, byte SIZEOF_XMMWORD
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add rdi, byte SIZEOF_XMMWORD ; outptr0
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
test rcx,rcx
jnz near .column_ld1
pop rcx ; col
pop rsi
pop rdi
add rsi, byte SIZEOF_JSAMPROW ; input_buf
add rdi, byte SIZEOF_JSAMPROW
dec rax ; num_rows
jg near .rowloop
.return:
pop rbx
uncollect_args
mov rsp,rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16

385
simd/jcgryss2.asm Normal file
View File

@@ -0,0 +1,385 @@
;
; jcgryss2.asm - grayscale colorspace conversion (SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
; JDIMENSION output_row, int num_rows);
;
%define img_width(b) (b)+8 ; JDIMENSION img_width
%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
%define output_row(b) (b)+20 ; JDIMENSION output_row
%define num_rows(b) (b)+24 ; int num_rows
%define original_ebp ebp+0
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
align 16
global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
push ebp
mov eax,esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp],eax
mov ebp,esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make a room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)]
test ecx,ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax,eax
jle near .return
alignx 16,7
.rowloop:
pushpic eax
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16,7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, BYTE [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, WORD [esi+ecx]
shl eax, WORD_BIT
or eax,edx
.column_ld4:
movd xmmA,eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA,xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF,xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
.column_ld32:
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB,xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16,7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE,xmmA
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH,xmmH
movdqa xmmC,xmmA
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB,xmmE
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF,xmmD
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA,xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE,xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF,xmmA
movdqa xmmH,xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16,7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD,xmmA
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC,xmmF
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB,xmmA
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG,xmmD
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE,xmmA
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH,xmmB
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF,xmmF
movdqa xmmC,xmmA
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD,xmmB
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG,xmmE
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF,xmmH
punpckhbw xmmH,xmmH
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6,xmm1
punpcklwd xmm1,xmm3
punpckhwd xmm6,xmm3
pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6,xmm0
punpcklwd xmm0,xmm2
punpckhwd xmm6,xmm2
pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4,xmm0
punpcklwd xmm0,xmm3
punpckhwd xmm4,xmm3
pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0,xmm3
paddd xmm4,xmm3
psrld xmm0,SCALEBITS ; xmm0=YOL
psrld xmm4,SCALEBITS ; xmm4=YOH
packssdw xmm0,xmm4 ; xmm0=YO
movdqa xmm4,xmm6
punpcklwd xmm6,xmm2
punpckhwd xmm4,xmm2
pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6,xmm2
paddd xmm4,xmm2
psrld xmm6,SCALEBITS ; xmm6=YEL
psrld xmm4,SCALEBITS ; xmm4=YEH
packssdw xmm6,xmm4 ; xmm6=YE
psllw xmm0,BYTE_BIT
por xmm6,xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx,ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp,ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 16

View File

@@ -43,6 +43,14 @@
#define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2
#define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2
#define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2
#define jconst_rgb_gray_convert_sse2 jSCRGBGRYS2
#define jsimd_rgb_gray_convert_sse2 jSRGBGRYS2
#define jsimd_extrgb_gray_convert_sse2 jSEXTRGBGRYS2
#define jsimd_extrgbx_gray_convert_sse2 jSEXTRGBXGRYS2
#define jsimd_extbgr_gray_convert_sse2 jSEXTBGRGRYS2
#define jsimd_extbgrx_gray_convert_sse2 jSEXTBGRXGRYS2
#define jsimd_extxbgr_gray_convert_sse2 jSEXTXBGRGRYS2
#define jsimd_extxrgb_gray_convert_sse2 jSEXTXRGBGRYS2
#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
#define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2
@@ -163,6 +171,35 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_rgb_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extrgb_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extrgbx_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extbgr_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extbgrx_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extxbgr_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extxrgb_gray_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_ycc_rgb_convert_mmx
JPP((JDIMENSION out_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -222,6 +259,36 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
extern const int jconst_rgb_gray_convert_sse2[];
EXTERN(void) jsimd_rgb_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extrgb_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extrgbx_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extbgr_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extbgrx_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extxbgr_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
EXTERN(void) jsimd_extxrgb_gray_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
extern const int jconst_ycc_rgb_convert_sse2[];
EXTERN(void) jsimd_ycc_rgb_convert_sse2
JPP((JDIMENSION out_width,

View File

@@ -83,6 +83,28 @@ jsimd_can_rgb_ycc (void)
return 0;
}
GLOBAL(int)
jsimd_can_rgb_gray (void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if ((simd_support & JSIMD_SSE2) &&
IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
return 1;
if (simd_support & JSIMD_MMX)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
@@ -154,6 +176,55 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch(cinfo->in_color_space)
{
case JCS_EXT_RGB:
sse2fct=jsimd_extrgb_gray_convert_sse2;
mmxfct=jsimd_extrgb_gray_convert_mmx;
break;
case JCS_EXT_RGBX:
sse2fct=jsimd_extrgbx_gray_convert_sse2;
mmxfct=jsimd_extrgbx_gray_convert_mmx;
break;
case JCS_EXT_BGR:
sse2fct=jsimd_extbgr_gray_convert_sse2;
mmxfct=jsimd_extbgr_gray_convert_mmx;
break;
case JCS_EXT_BGRX:
sse2fct=jsimd_extbgrx_gray_convert_sse2;
mmxfct=jsimd_extbgrx_gray_convert_mmx;
break;
case JCS_EXT_XBGR:
sse2fct=jsimd_extxbgr_gray_convert_sse2;
mmxfct=jsimd_extxbgr_gray_convert_mmx;
break;
case JCS_EXT_XRGB:
sse2fct=jsimd_extxrgb_gray_convert_sse2;
mmxfct=jsimd_extxrgb_gray_convert_mmx;
break;
default:
sse2fct=jsimd_rgb_gray_convert_sse2;
mmxfct=jsimd_rgb_gray_convert_mmx;
break;
}
if ((simd_support & JSIMD_SSE2) &&
IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
sse2fct(cinfo->image_width, input_buf,
output_buf, output_row, num_rows);
else if (simd_support & JSIMD_MMX)
mmxfct(cinfo->image_width, input_buf,
output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,

View File

@@ -46,6 +46,23 @@ jsimd_can_rgb_ycc (void)
return 1;
}
GLOBAL(int)
jsimd_can_rgb_gray (void)
{
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
return 0;
return 1;
}
GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
@@ -98,6 +115,41 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch(cinfo->in_color_space)
{
case JCS_EXT_RGB:
sse2fct=jsimd_extrgb_gray_convert_sse2;
break;
case JCS_EXT_RGBX:
sse2fct=jsimd_extrgbx_gray_convert_sse2;
break;
case JCS_EXT_BGR:
sse2fct=jsimd_extbgr_gray_convert_sse2;
break;
case JCS_EXT_BGRX:
sse2fct=jsimd_extbgrx_gray_convert_sse2;
break;
case JCS_EXT_XBGR:
sse2fct=jsimd_extxbgr_gray_convert_sse2;
break;
case JCS_EXT_XRGB:
sse2fct=jsimd_extxrgb_gray_convert_sse2;
break;
default:
sse2fct=jsimd_rgb_gray_convert_sse2;
break;
}
sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,